[PATCH] drm/amdgpu: increase mes submission timeout

2024-04-11 Thread Jonathan Kim
MES internally has a timeout allowance of 2 seconds.
Increase driver timeout to 3 seconds to be safe.
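
For context, a condensed sketch of the submission path the new value feeds
into (a sketch assuming the surrounding mes_v11_0 code; the timeout is
consumed in microseconds by the fence-polling helper, so 3000000 covers the
2 second internal MES allowance with headroom):

        signed long timeout = 3000000;  /* 3000 ms, in usec units */

        if (amdgpu_emu_mode)
                timeout *= 100;         /* emulation runs much slower */

        r = amdgpu_fence_wait_polling(ring, seq, timeout);
        if (r < 1)
                return -ETIMEDOUT;      /* MES never acknowledged the packet */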

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e5230078a4cd..81833395324a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -111,7 +111,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct 
amdgpu_mes *mes,
struct amdgpu_device *adev = mes->adev;
struct amdgpu_ring *ring = &mes->ring;
unsigned long flags;
-   signed long timeout = adev->usec_timeout;
+   signed long timeout = 3000000; /* 3000 ms */
 
if (amdgpu_emu_mode) {
timeout *= 100;
-- 
2.34.1



[PATCH] drm/amdkfd: range check cp bad op exception interrupts

2024-03-13 Thread Jonathan Kim
Due to a CP interrupt bug, garbage exception codes can be raised for bad
packets.  Do a range check so that the debugger and runtime do not receive
garbage codes.
Update the user API to guard exception code type checking as well.
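
As a minimal sketch, the added uapi check amounts to validating the code range
before classifying it (names follow the existing KFD_EC_MASK /
KFD_DBG_EC_TYPE_IS_QUEUE conventions in kfd_ioctl.h; the assumed form is):

        #define KFD_DBG_EC_IS_VALID(ecode) (ecode > EC_NONE && ecode < EC_MAX)

        #define KFD_DBG_EC_TYPE_IS_PACKET(ecode)        \
                (KFD_DBG_EC_IS_VALID(ecode) &&          \
                 !!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PACKET))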

Signed-off-by: Jonathan Kim 
Tested-by: Jesse Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c|  3 ++-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c|  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c |  3 ++-
 include/uapi/linux/kfd_ioctl.h  | 17 ++---
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index a8e76287dde0..013d0a073b9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -339,7 +339,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
break;
}
kfd_signal_event_interrupt(pasid, context_id0 & 
0x7f, 23);
-   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+  
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..fe2ad0c0de95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -328,7 +328,8 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
-   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+   else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+
KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index ff7392336795..5483211c5d3d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -388,7 +388,8 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
break;
}
kfd_signal_event_interrupt(pasid, sq_int_data, 24);
-   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) {
+   } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+  
KFD_DBG_EC_TYPE_IS_PACKET(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0))) {
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_DEBUG_DOORBELL_ID(context_id0),

KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)),
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 9ce46edc62a5..2040a470ddb4 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -913,14 +913,25 @@ enum kfd_dbg_trap_exception_code {
 KFD_EC_MASK(EC_DEVICE_NEW))
 #define KFD_EC_MASK_PROCESS(KFD_EC_MASK(EC_PROCESS_RUNTIME) |  \
 KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE))
+#define KFD_EC_MASK_PACKET (KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) |                \
+                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) |     \
+                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) |                   \
+                KFD_EC_MASK(EC_QUEUE_PACKET_RESERVED) |                                \
+                KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) |                             \
+                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) |        \
+                KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) |               \
+                KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED))
 
 /* Checks for exception code types for KFD search */
+#define KFD_DBG_EC_IS_VALID(ecode) (ecode > EC_NONE && ecode < EC_MAX)
 #define KFD_DBG_EC_TYPE_IS_QUEUE(ecode)
\
-   (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE))
+   (KFD_DBG_EC_IS_VALID(ecode)

[PATCH] drm/amdkfd: fix process reference drop on debug ioctl

2024-02-21 Thread Jonathan Kim
Prevent dropping the KFD process reference at the end of a debug
IOCTL call when the acquired process value is an error pointer.
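
A hedged sketch of why the explicit NULL assignment matters: the shared exit
label only drops the reference when a real process was acquired, so an
ERR_PTR value must never reach it (assumed shape of the exit path in
kfd_ioctl_set_debug_trap()):

        if (IS_ERR_OR_NULL(target)) {
                r = target ? PTR_ERR(target) : -ESRCH;
                target = NULL;  /* don't treat the ERR_PTR as a live reference */
                goto out;
        }

        /* ... debug op dispatch ... */

out:
        if (target)
                kfd_unref_process(target);  /* would receive an ERR_PTR without the fix */
        return r;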

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 80e90fdef291..824e660283b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2935,6 +2935,7 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
if (IS_ERR_OR_NULL(target)) {
pr_debug("Cannot find process PID %i to debug\n", args->pid);
r = target ? PTR_ERR(target) : -ESRCH;
+   target = NULL;
goto out;
}
 
-- 
2.34.1



[PATCH] drm/amdkfd: only flush mes process context if mes support is there

2023-12-13 Thread Jonathan Kim
Fix up the MES process context flush to prevent non-MES devices from
spamming error messages or running into undefined behaviour during
process termination.

Fixes: 73204d028eb5 ("drm/amdkfd: fix mes set shader debugger process 
management")
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 8e55e78fce4e..43eff221eae5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -87,7 +87,8 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
return;
 
dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
-   amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr);
+   if (dev->kfd->shared_resources.enable_mes)
+   amdgpu_mes_flush_shader_debugger(dev->adev, 
pdd->proc_ctx_gpu_addr);
pdd->already_dequeued = true;
 }
 
-- 
2.34.1



[PATCH] drm/amdkfd: fix mes set shader debugger process management

2023-12-11 Thread Jonathan Kim
MES provides the driver a call to explicitly flush stale process memory
within the MES to avoid a race condition that results in a fatal
memory violation.

When SET_SHADER_DEBUGGER is called, the driver passes a memory address
that represents a process context address MES uses to keep track of
future per-process calls.

Normally, MES will purge its process context list when the last queue
has been removed.  The driver, however, can call SET_SHADER_DEBUGGER
regardless of whether a queue has been added or not.

If SET_SHADER_DEBUGGER has been called with no queues as the last call
prior to process termination, the passed process context address will
still reside within MES.

On a new process call to SET_SHADER_DEBUGGER, the driver may end up
passing an identical process context address value (based on the
per-process GPU memory address) to MES, but the address now points to a
newly allocated buffer object created during KFD process creation.  Since
MES is unaware of this, accesses through the passed address hit the stale
object within MES and trigger a fatal memory violation.

The solution is for KFD to explicitly flush the process context address
from MES on process termination.

Note that the flush call and the MES debugger calls use the same MES
interface but are separated as KFD calls to avoid conflicting with each
other.
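
As a usage sketch, the new helper is meant to be called exactly once from KFD
when the process is torn down; together with the MES-support guard from the
follow-up fix earlier in this digest, the call site looks roughly like:

        /* kfd_process_dequeue_from_device() */
        dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
        if (dev->kfd->shared_resources.enable_mes)
                amdgpu_mes_flush_shader_debugger(dev->adev,
                                                 pdd->proc_ctx_gpu_addr);
        pdd->already_dequeued = true;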

Signed-off-by: Jonathan Kim 
Tested-by: Alice Wong 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   | 31 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   | 10 +++---
 .../amd/amdkfd/kfd_process_queue_manager.c|  1 +
 drivers/gpu/drm/amd/include/mes_v11_api_def.h |  3 +-
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e544b823abf6..e98de23250dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -916,6 +916,11 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
op_input.set_shader_debugger.process_context_addr = 
process_context_addr;
op_input.set_shader_debugger.flags.u32all = flags;
+
+   /* use amdgpu mes_flush_shader_debugger instead */
+   if (op_input.set_shader_debugger.flags.process_ctx_flush)
+   return -EINVAL;
+
op_input.set_shader_debugger.spi_gdbg_per_vmid_cntl = 
spi_gdbg_per_vmid_cntl;
memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
@@ -935,6 +940,32 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
return r;
 }
 
+int amdgpu_mes_flush_shader_debugger(struct amdgpu_device *adev,
+uint64_t process_context_addr)
+{
+   struct mes_misc_op_input op_input = {0};
+   int r;
+
+   if (!adev->mes.funcs->misc_op) {
+   DRM_ERROR("mes flush shader debugger is not supported!\n");
+   return -EINVAL;
+   }
+
+   op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
+   op_input.set_shader_debugger.process_context_addr = 
process_context_addr;
+   op_input.set_shader_debugger.flags.process_ctx_flush = true;
+
+   amdgpu_mes_lock(&adev->mes);
+
+   r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
+   if (r)
+   DRM_ERROR("failed to set_shader_debugger\n");
+
+   amdgpu_mes_unlock(&adev->mes);
+
+   return r;
+}
+
 static void
 amdgpu_mes_ring_to_queue_props(struct amdgpu_device *adev,
   struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 894b9b133000..7d4f93fea937 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -296,9 +296,10 @@ struct mes_misc_op_input {
uint64_t process_context_addr;
union {
struct {
-   uint64_t single_memop : 1;
-   uint64_t single_alu_op : 1;
-   uint64_t reserved: 30;
+   uint32_t single_memop : 1;
+   uint32_t single_alu_op : 1;
+   uint32_t reserved: 29;
+   uint32_t process_ctx_flush: 1;
};
uint32_t u32all;
} flags;
@@ -374,7 +375,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
const uint32_t *tcp_watch_cntl,
uint32_t flags,
bool trap_en);
-
+int amdgpu_mes_flush_shader_debugger(struct amdgpu_device *adev,
+  

[PATCH] drm/amdgpu: update xgmi num links info post gc9.4.2

2023-11-17 Thread Jonathan Kim
GC IP 9.4.2 and up support TA reporting of the number
of xGMI links between peers.

Tested-by: Vignesh Chander 
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ab17330a6ed..2d22f7d45512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -547,7 +547,7 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct 
amdgpu_device *dst,
struct amdgpu_device *adev = dst, *peer_adev;
int num_links;
 
-   if (adev->asic_type != CHIP_ALDEBARAN)
+   if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
return 0;
 
if (src)
-- 
2.34.1



[PATCH] drm/amdkfd: fix add queue process context clear without runtime enable

2023-09-12 Thread Jonathan Kim
There are cases where the HSA runtime is not enabled through the
AMDKFD_IOC_RUNTIME_ENABLE call when adding queues, and the MES ADD_QUEUE
API should then clear the MES process context instead of relying on
SET_SHADER_DEBUGGER.  Examples are legacy HSA runtime builds that do not
support the current exception handling, and KFD test runs.

The only time ADD_QUEUE.skip_process_ctx_clear is required is for
debugger use cases where a debugged process is always runtime enabled
when adding a queue.
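
Restated as a sketch, the ADD_QUEUE decision becomes:

        /* Only skip the MES process context clear for the debugger case,
         * where the process is guaranteed to be runtime enabled.
         */
        queue_input.skip_process_ctx_clear =
                qpd->pqm->process->runtime_info.runtime_state ==
                        DEBUG_RUNTIME_STATE_ENABLED &&
                (qpd->pqm->process->debug_trap_enabled ||
                 kfd_dbg_has_ttmps_always_setup(q->device));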

Tested-by: Shikai Guo 
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 6d07a5dd2648..77159b03a422 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -227,8 +227,10 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
-   queue_input.skip_process_ctx_clear = 
qpd->pqm->process->debug_trap_enabled ||
-
kfd_dbg_has_ttmps_always_setup(q->device);
+   queue_input.skip_process_ctx_clear =
+   qpd->pqm->process->runtime_info.runtime_state == 
DEBUG_RUNTIME_STATE_ENABLED &&
+   
(qpd->pqm->process->debug_trap_enabled ||
+
kfd_dbg_has_ttmps_always_setup(q->device));
 
queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_type < 0) {
-- 
2.34.1



[PATCH] drm/amdkfd: fix add queue process context clear for hsa non-init cases

2023-09-12 Thread Jonathan Kim
There are cases where HSA is not initialized when adding queues and
the ADD_QUEUE API should clear the MES process context instead of
SET_SHADER_DEBUGGER.

The only time ADD_QUEUE.skip_process_ctx_clear is required is for
debugger use cases, and a debugged process is always runtime enabled
when adding a queue.

Tested-by: Shikai Guo 
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 6d07a5dd2648..77159b03a422 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -227,8 +227,10 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
-   queue_input.skip_process_ctx_clear = 
qpd->pqm->process->debug_trap_enabled ||
-
kfd_dbg_has_ttmps_always_setup(q->device);
+   queue_input.skip_process_ctx_clear =
+   qpd->pqm->process->runtime_info.runtime_state == 
DEBUG_RUNTIME_STATE_ENABLED &&
+   
(qpd->pqm->process->debug_trap_enabled ||
+
kfd_dbg_has_ttmps_always_setup(q->device));
 
queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_type < 0) {
-- 
2.34.1



[PATCH] drm/amdkfd: fix double assign skip process context clear

2023-08-10 Thread Jonathan Kim
Remove redundant assignment when skipping process ctx clear.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index aa5091f18681..89c2bfcb36ce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -227,7 +227,6 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
-   queue_input.skip_process_ctx_clear = 
qpd->pqm->process->debug_trap_enabled;
queue_input.skip_process_ctx_clear = 
qpd->pqm->process->debug_trap_enabled ||
 
kfd_dbg_has_ttmps_always_setup(q->device);
 
-- 
2.25.1



[PATCH] drm/amdkfd: fix and enable ttmp setup for gfx11

2023-07-24 Thread Jonathan Kim
The MES cached process context must be cleared on adding any queue for
the first time.

For proper debug support, the MES will clear its cached process context
on the first call to SET_SHADER_DEBUGGER.

This allows TTMPs to be persistently enabled in a safe manner.

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 13 -
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 19 +--
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h| 11 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 +---
 6 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 77ca5cbfb601..d67d003bada2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -637,7 +637,7 @@ static uint32_t kgd_gfx_v11_disable_debug_trap(struct 
amdgpu_device *adev,
 {
uint32_t data = 0;
 
-   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e0f9cf6dd8fd..42df972357e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2755,6 +2755,16 @@ static int runtime_enable(struct kfd_process *p, 
uint64_t r_debug,
 
if (pdd->qpd.queue_count)
return -EEXIST;
+
+   /*
+* Setup TTMPs by default.
+* Note that this call must remain here for MES ADD QUEUE to
+* skip_process_ctx_clear unconditionally as the first call to
+* SET_SHADER_DEBUGGER clears any stale process context data
+* saved in MES.
+*/
+   if (pdd->dev->kfd->shared_resources.enable_mes)
+   kfd_dbg_set_mes_debug_mode(pdd, 
!kfd_dbg_has_cwsr_workaround(pdd->dev));
}
 
p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
@@ -2848,7 +2858,8 @@ static int runtime_disable(struct kfd_process *p)
if (!pdd->dev->kfd->shared_resources.enable_mes)
debug_refresh_runlist(pdd->dev->dqm);
else
-   kfd_dbg_set_mes_debug_mode(pdd);
+   kfd_dbg_set_mes_debug_mode(pdd,
+  
!kfd_dbg_has_cwsr_workaround(pdd->dev));
}
}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 1f82caea59ba..9ec750666382 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -344,11 +344,10 @@ static int kfd_dbg_set_workaround(struct kfd_process 
*target, bool enable)
return r;
 }
 
-int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
 {
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
uint32_t flags = pdd->process->dbg_flags;
-   bool sq_trap_en = !!spi_dbg_cntl || 
!kfd_dbg_has_cwsr_workaround(pdd->dev);
 
if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
return 0;
@@ -432,7 +431,7 @@ int kfd_dbg_trap_clear_dev_address_watch(struct 
kfd_process_device *pdd,
if (!pdd->dev->kfd->shared_resources.enable_mes)
r = debug_map_and_unlock(pdd->dev->dqm);
else
-   r = kfd_dbg_set_mes_debug_mode(pdd);
+   r = kfd_dbg_set_mes_debug_mode(pdd, true);
 
kfd_dbg_clear_dev_watch_id(pdd, watch_id);
 
@@ -474,7 +473,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct 
kfd_process_device *pdd,
if (!pdd->dev->kfd->shared_resources.enable_mes)
r = debug_map_and_unlock(pdd->dev->dqm);
else
-   r = kfd_dbg_set_mes_debug_mode(pdd);
+   r = kfd_dbg_set_mes_debug_mode(pdd, true);
 
/* HWS is broken so no point in HW rollback but release the watchpoint 
anyways */
if (r)
@@ -516,7 +515,7 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, 
uint32_t *flags)
if (!pdd->dev->kfd->shared_resources.enable_mes)
r = debug_refresh_runlist(pdd->dev->dqm);
else
-   r = kfd_dbg_set_mes_debug_mode(pdd);
+   r

[PATCH] drm/amdkfd: enable cooperative groups for gfx11

2023-07-19 Thread Jonathan Kim
If queues are marked exclusively_scheduled, MES can concurrently schedule
queues that require exclusive device access without requiring GWS.  Similar
to the F32 HWS, MES will manage quality of service for these queues.
Use this for cooperative groups, since cooperative groups are
device-occupancy limited.

Since some GFX11 devices can only be debugged with partial CUs, do not
allow the debugging of cooperative groups on these devices as the CU
occupancy limit will change on attach.

In addition, zero initialize the MES add queue submission vector for MES
initialization tests as we do not want these to be cooperative
dispatches.

v2: fix up indentation and comments.
remove unnecessary perf warning on oversubscription.
change 0 init to 0 memset to deal with padding.
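
A hedged sketch of the intended KFD-side wiring, under the assumption that a
cooperative queue (which previously would have requested GWS) is the one
marked for exclusive scheduling; this is a sketch, not the verbatim hunk:

        /* add_queue_mes(): cooperative dispatches go exclusive instead of GWS */
        queue_input.exclusively_scheduled = q->properties.is_gws;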

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c  |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c   |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |  6 +-
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c|  7 ++-
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   | 12 
 drivers/gpu/drm/amd/include/mes_v11_api_def.h|  4 +++-
 9 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f808841310fd..72ab6a838bb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -642,6 +642,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int 
gang_id,
unsigned long flags;
int r;
 
+   memset(&queue_input, 0, sizeof(struct mes_add_queue_input));
+
/* allocate the mes queue buffer */
queue = kzalloc(sizeof(struct amdgpu_mes_queue), GFP_KERNEL);
if (!queue) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 2d6ac30b7135..2053954a235c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -224,6 +224,7 @@ struct mes_add_queue_input {
uint32_tis_kfd_process;
uint32_tis_aql_queue;
uint32_tqueue_size;
+   uint32_texclusively_scheduled;
 };
 
 struct mes_remove_queue_input {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 1bdaa00c0b46..8e67e965f7ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
 
+   mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;
+
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
offsetof(union MESAPI__ADD_QUEUE, api_status));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 40ac093b5035..e0f9cf6dd8fd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
-   if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+   if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
+ kfd_dbg_has_cwsr_workaround(dev))) {
retval = -EBUSY;
goto out_unlock;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index ccfc81f085ce..1f82caea59ba 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
if (!KFD_IS_SOC15(pdd->dev))
return -ENODEV;
 
-   if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+   if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
+kfd_dbg_has_cwsr_workaround(pdd->dev)))
return -EBUSY;
}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0b3dc754e06b..ebc9674d3ce1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
 {
int ret = 0;
struct kfd_dev *kfd = node->kfd;
+   uint32_t mes_rev = node->adev

[PATCH 1/2] drm/amdkfd: fix trap handling work around for debugging

2023-07-14 Thread Jonathan Kim
Update the list of devices that require the cwsr trap handling
workaround for debugging use cases.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 5 ++---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h| 6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 190b03efe5ff..ccfc81f085ce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -302,8 +302,7 @@ static int kfd_dbg_set_queue_workaround(struct queue *q, 
bool enable)
if (!q)
return 0;
 
-   if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
-   KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
+   if (!kfd_dbg_has_cwsr_workaround(q->device))
return 0;
 
if (enable && q->properties.is_user_cu_masked)
@@ -349,7 +348,7 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device 
*pdd)
 {
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
uint32_t flags = pdd->process->dbg_flags;
-   bool sq_trap_en = !!spi_dbg_cntl;
+   bool sq_trap_en = !!spi_dbg_cntl || 
!kfd_dbg_has_cwsr_workaround(pdd->dev);
 
if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ba616ed17dee..586d7f886712 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -101,6 +101,12 @@ static inline bool kfd_dbg_is_rlc_restore_supported(struct 
kfd_node *dev)
 KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
 }
 
+static inline bool kfd_dbg_has_cwsr_workaround(struct kfd_node *dev)
+{
+   return KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
+  KFD_GC_VERSION(dev) <= IP_VERSION(11, 0, 3);
+}
+
 static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev)
 {
if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 31cac1fd0d58..761963ad6154 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -226,8 +226,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,
queue_input.paging = false;
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
-   queue_input.trap_en = KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) 
||
- KFD_GC_VERSION(q->device) > IP_VERSION(11, 0, 3);
+   queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
queue_input.skip_process_ctx_clear = 
qpd->pqm->process->debug_trap_enabled;
 
queue_type = convert_to_mes_queue_type(q->properties.type);
@@ -1827,8 +1826,7 @@ static int create_queue_cpsch(struct device_queue_manager 
*dqm, struct queue *q,
 */
q->properties.is_evicted = !!qpd->evicted;
q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
-   KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) &&
-   KFD_GC_VERSION(q->device) <= IP_VERSION(11, 0, 3);
+ kfd_dbg_has_cwsr_workaround(q->device);
 
if (qd)
mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, 
&q->gart_mqd_addr,
-- 
2.25.1



[PATCH 2/2] drm/amdkfd: enable cooperative groups for gfx11

2023-07-14 Thread Jonathan Kim
If queues are marked exclusively_scheduled, MES can concurrently schedule
queues that require exclusive device access without requiring GWS.  Similar
to the F32 HWS, MES will manage quality of service for these queues.
Use this for cooperative groups, since cooperative groups are
device-occupancy limited.

Since some GFX11 devices can only be debugged with partial CUs, do not
allow the debugging of cooperative groups on these devices as the CU
occupancy limit will change on attach.

In addition, zero initialize the MES add queue submission vector for MES
initialization tests as we do not want these to be cooperative
dispatches.

NOTE: FIXME MES FW enablement checks are a placeholder at the moment and
will be updated when the binary revision number is finalized.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  3 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  9 -
 .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c| 11 +++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h |  4 +++-
 9 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e9091ebfe230..8d13623389d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -638,7 +638,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int 
gang_id,
 {
struct amdgpu_mes_queue *queue;
struct amdgpu_mes_gang *gang;
-   struct mes_add_queue_input queue_input;
+   struct mes_add_queue_input queue_input = {0};
unsigned long flags;
int r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 2d6ac30b7135..2053954a235c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -224,6 +224,7 @@ struct mes_add_queue_input {
uint32_tis_kfd_process;
uint32_tis_aql_queue;
uint32_tqueue_size;
+   uint32_texclusively_scheduled;
 };
 
 struct mes_remove_queue_input {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 1bdaa00c0b46..8e67e965f7ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -214,6 +214,8 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
 
+   mes_add_queue_pkt.exclusively_scheduled = input->exclusively_scheduled;
+
return mes_v11_0_submit_pkt_and_poll_completion(mes,
&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
offsetof(union MESAPI__ADD_QUEUE, api_status));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 40ac093b5035..e18401811956 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1487,7 +1487,8 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
-   if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+   if (p->debug_trap_enabled && (!kfd_dbg_has_gws_support(dev) ||
+  kfd_dbg_has_cwsr_workaround(dev))) {
retval = -EBUSY;
goto out_unlock;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index ccfc81f085ce..895e7f690fd0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -753,7 +753,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
if (!KFD_IS_SOC15(pdd->dev))
return -ENODEV;
 
-   if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+   if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
+ 
kfd_dbg_has_cwsr_workaround(pdd->dev)))
return -EBUSY;
}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0b3dc754e06b..9f4f75fd2fb2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -508,6 +508,7 @@ static int kfd_gws_init(struct kfd_node *node)
 {
int ret = 0;
struct kfd_dev *kfd = node->kfd;
+   uint32_t mes_rev = node->adev

[PATCH] drm/amdkfd: report dispatch id always saved in ttmps after gc9.4.2

2023-07-11 Thread Jonathan Kim
The feature to save the dispatch ID in trap temporaries 6 & 7 on context
save is unconditionally enabled during MQD initialization.

Now that TTMPs are always set up regardless of debug mode for GC 9.4.3, we
should report that the dispatch ID is always available for debug/trap
handling.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 1a4cdee86759..eeedc3ddffeb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1941,10 +1941,11 @@ static void kfd_topology_set_capabilities(struct 
kfd_topology_device *dev)
HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
 
-   if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
+   if (KFD_GC_VERSION(dev->gpu) != IP_VERSION(9, 4, 2))
dev->node_props.debug_prop |=
HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
-   else
+
+   if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 4, 2))
dev->node_props.capability |=

HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
} else {
-- 
2.25.1



[PATCH] drm/amdkfd: decrement queue count on mes queue destroy

2023-06-13 Thread Jonathan Kim
Queue count should decrement on queue destruction regardless of HWS
support type.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 8a39a9e0ed5a..f515cb8f30ca 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2089,8 +2089,8 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
list_del(&q->list);
qpd->queue_count--;
if (q->properties.is_active) {
+   decrement_queue_count(dqm, qpd, q);
if (!dqm->dev->kfd->shared_resources.enable_mes) {
-   decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
  
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
  USE_DEFAULT_GRACE_PERIOD);
-- 
2.25.1



[PATCH] drm/amdkfd: fix null queue check on debug setting exceptions

2023-06-12 Thread Jonathan Kim
The null check should be done on the queue struct itself and not on the
process queue list node.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index cd34e7aaead4..fff3ccc04fa9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -1097,7 +1097,7 @@ void kfd_dbg_set_enabled_debug_exception_mask(struct 
kfd_process *target,
 
pqm = &target->pqm;
list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
-   if (!pqn)
+   if (!pqn->q)
continue;
 
found_mask |= pqn->q->properties.exception_status;
-- 
2.25.1



[PATCH] drm/amdkfd: fix and enable debugging for gfx11

2023-06-07 Thread Jonathan Kim
There are a couple of fixes required to enable gfx11 debugging.

First, ADD_QUEUE.trap_en is an inappropriate place to toggle
a per-process register so move it to SET_SHADER_DEBUGGER.trap_en.
When ADD_QUEUE.skip_process_ctx_clear is set, MES will prioritize
the SET_SHADER_DEBUGGER.trap_en setting.

Second, to preserve correct save/restore of privileged wave states
in coordination with the trap enablement setting, resume suspended
waves early in the disable call.

NOTE: The AMDGPU_MES_VERSION_MASK check is a placeholder as the MES FW
updates have been reviewed but are awaiting binary creation.  Once the
binaries have been created, this check may be subject to change.

v2: do a trap_en safety check in case an older MES doesn't accept the
unused trap_en d-word.
remove the unnecessary process termination workaround.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c|  7 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h|  4 +++-
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 14 ++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  3 +--
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c  | 12 +++-
 drivers/gpu/drm/amd/include/mes_v11_api_def.h  |  1 +
 7 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 20cc3fffe921..e9091ebfe230 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -928,7 +928,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
uint64_t process_context_addr,
uint32_t spi_gdbg_per_vmid_cntl,
const uint32_t *tcp_watch_cntl,
-   uint32_t flags)
+   uint32_t flags,
+   bool trap_en)
 {
struct mes_misc_op_input op_input = {0};
int r;
@@ -945,6 +946,10 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
 
+   if (((adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) >>
+   AMDGPU_MES_API_VERSION_SHIFT) >= 14)
+   op_input.set_shader_debugger.trap_en = trap_en;
+
amdgpu_mes_lock(&adev->mes);

r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index b5f5eed2b5ef..2d6ac30b7135 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -294,6 +294,7 @@ struct mes_misc_op_input {
} flags;
uint32_t spi_gdbg_per_vmid_cntl;
uint32_t tcp_watch_cntl[4];
+   uint32_t trap_en;
} set_shader_debugger;
};
 };
@@ -361,7 +362,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
uint64_t process_context_addr,
uint32_t spi_gdbg_per_vmid_cntl,
const uint32_t *tcp_watch_cntl,
-   uint32_t flags);
+   uint32_t flags,
+   bool trap_en);
 
 int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
int queue_type, int idx,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index c4e3cb8d44de..1bdaa00c0b46 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -347,6 +347,7 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl,
input->set_shader_debugger.tcp_watch_cntl,

sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
+   misc_pkt.set_shader_debugger.trap_en = 
input->set_shader_debugger.trap_en;
break;
default:
DRM_ERROR("unsupported misc op (%d) \n", input->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 125274445f43..cd34e7aaead4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -349,12 +349,13 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device 
*pdd)
 {
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
uint32_t flags = pdd->process->dbg_flags;
+   bool sq_trap_en = !!spi_dbg_cntl;
 
if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
return 0;
 

[PATCH] drm/amdkfd: optimize gfx off enable toggle for debugging

2023-06-07 Thread Jonathan Kim
Legacy debug devices limited to pinning a single debug VMID for debugging
are the only devices that require disabling GFX OFF while accessing
debug registers.  Debug devices that support multi-process debugging
rely on the hardware scheduler to update debug registers and do not run
into GFX OFF access issues.

Remove KFD GFX OFF enable toggle clutter by moving these calls into the
KGD debug calls themselves.

v2: toggle gfx off around address watch hi/lo settings as well.
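
The resulting pattern inside each KGD debug call is simply to hold a GFX OFF
disable reference for the duration of the privileged register access (sketch
mirroring the aldebaran hunk below):

        amdgpu_gfx_off_ctrl(adev, false);       /* take a GFX OFF disable reference */

        /* ... privileged debug register writes, e.g. the TCP_WATCH
         * address high/low registers below ...
         */

        amdgpu_gfx_off_ctrl(adev, true);        /* drop the reference */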

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  4 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  7 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 33 ++-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  4 +++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 24 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 22 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 21 +---
 7 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 60f9e027fb66..1f0e6ec56618 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -150,6 +150,8 @@ static uint32_t kgd_gfx_aldebaran_set_address_watch(
VALID,
1);
 
+   amdgpu_gfx_off_ctrl(adev, false);
+
WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_high);
@@ -158,6 +160,8 @@ static uint32_t kgd_gfx_aldebaran_set_address_watch(
(watch_id * TCP_WATCH_STRIDE)),
watch_address_low);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return watch_address_cntl;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 625db444df1c..a4e28d547173 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -350,6 +350,8 @@ static uint32_t kgd_arcturus_enable_debug_trap(struct 
amdgpu_device *adev,
bool restore_dbg_registers,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
+
mutex_lock(&adev->grbm_idx_mutex);
 
kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
@@ -362,6 +364,8 @@ static uint32_t kgd_arcturus_enable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(&adev->grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -375,6 +379,7 @@ static uint32_t kgd_arcturus_disable_debug_trap(struct 
amdgpu_device *adev,
bool keep_trap_enabled,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
 
mutex_lock(&adev->grbm_idx_mutex);
 
@@ -388,6 +393,8 @@ static uint32_t kgd_arcturus_disable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(&adev->grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 const struct kfd2kgd_calls arcturus_kfd2kgd = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 8ad7a7779e14..415928139861 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -754,12 +754,13 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct 
amdgpu_device *adev,
bool restore_dbg_registers,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
 
mutex_lock(&adev->grbm_idx_mutex);
 
kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
 
-   /* assume gfx off is disabled for the debug session if rlc restore not 
supported. */
+   /* keep gfx off disabled for the debug session if rlc restore not 
supported. */
if (restore_dbg_registers) {
uint32_t data = 0;
 
@@ -784,6 +785,8 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device 
*adev,
 
mutex_unlock(&adev->grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -791,6 +794,8 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
bool keep_trap_enabled,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
+
mutex_lock(&adev->grbm_idx_mutex);
 
kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
@@ -801,6 +806,16 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(&adev->grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
+   /*
+* Remove the extra gfx off disable reference from debug restore call
+   

[PATCH] drm/amdkfd: fix and enable debugging for gfx11

2023-06-07 Thread Jonathan Kim
There are a few fixes required to enable gfx11 debugging.

First, ADD_QUEUE.trap_en is an inappropriate place to toggle
a per-process register so move it to SET_SHADER_DEBUGGER.trap_en.
When ADD_QUEUE.skip_process_ctx_clear is set, MES will prioritize
the SET_SHADER_DEBUGGER.trap_en setting.

Second, to preserve correct save/restore of privileged wave states
in coordination with the trap enablement setting, resume suspended
waves early in the disable call.

Finally, displaced single stepping can cause non-fatal illegal
instructions during process termination on debug disable.  To work
around this, stall the waves prior to disable and allow clean
up to happen naturally on process termination.

NOTE: The AMDGPU_MES_VERSION_MASK check is a placeholder as the MES FW
updates have been reviewed but are awaiting binary creation.  Once the
binaries have been created, this check may be subject to change.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  5 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   |  4 ++-
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 31 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 ---
 drivers/gpu/drm/amd/include/mes_v11_api_def.h |  1 +
 7 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 20cc3fffe921..95d69f9c7361 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -928,7 +928,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
uint64_t process_context_addr,
uint32_t spi_gdbg_per_vmid_cntl,
const uint32_t *tcp_watch_cntl,
-   uint32_t flags)
+   uint32_t flags,
+   bool trap_en)
 {
struct mes_misc_op_input op_input = {0};
int r;
@@ -945,6 +946,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
 
+   op_input.set_shader_debugger.trap_en = trap_en;
+
amdgpu_mes_lock(&adev->mes);

r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index b5f5eed2b5ef..2d6ac30b7135 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -294,6 +294,7 @@ struct mes_misc_op_input {
} flags;
uint32_t spi_gdbg_per_vmid_cntl;
uint32_t tcp_watch_cntl[4];
+   uint32_t trap_en;
} set_shader_debugger;
};
 };
@@ -361,7 +362,8 @@ int amdgpu_mes_set_shader_debugger(struct amdgpu_device 
*adev,
uint64_t process_context_addr,
uint32_t spi_gdbg_per_vmid_cntl,
const uint32_t *tcp_watch_cntl,
-   uint32_t flags);
+   uint32_t flags,
+   bool trap_en);
 
 int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
int queue_type, int idx,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index c4e3cb8d44de..1bdaa00c0b46 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -347,6 +347,7 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl,
input->set_shader_debugger.tcp_watch_cntl,

sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
+   misc_pkt.set_shader_debugger.trap_en = 
input->set_shader_debugger.trap_en;
break;
default:
DRM_ERROR("unsupported misc op (%d) \n", input->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 125274445f43..e7bc07068eed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -349,12 +349,30 @@ int kfd_dbg_set_mes_debug_mode(struct kfd_process_device 
*pdd)
 {
uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
uint32_t flags = pdd->process->dbg_flags;
+   bool sq_trap_en = !!spi_dbg_cntl;
 
if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
return 0;
 
+   /*
+* For displaced single stepping, the debugger inserts s_trap 
instructions
+ 

[PATCH] drm/amdkfd: fix vmfault signalling with additional data.

2023-06-07 Thread Jonathan Kim
VM fault exceptions should be raised with the additional exception data
when it is available.

Reported-by: Mukul Joshi 
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 34 +++--
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 7ff5c4e1b7e2..c069ef77daa5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1240,19 +1240,24 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, 
u32 pasid,
return;
}
 
-   memset(&memory_exception_data, 0, sizeof(memory_exception_data));
-   memory_exception_data.gpu_id = user_gpu_id;
-   memory_exception_data.failure.imprecise = true;
-   /* Set failure reason */
-   if (info) {
-   memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
-   memory_exception_data.failure.NotPresent =
-   info->prot_valid ? 1 : 0;
-   memory_exception_data.failure.NoExecute =
-   info->prot_exec ? 1 : 0;
-   memory_exception_data.failure.ReadOnly =
-   info->prot_write ? 1 : 0;
-   memory_exception_data.failure.imprecise = 0;
+   /* SoC15 chips and onwards will pass in data from now on. */
+   if (!data) {
+   memset(&memory_exception_data, 0, 
sizeof(memory_exception_data));
+   memory_exception_data.gpu_id = user_gpu_id;
+   memory_exception_data.failure.imprecise = true;
+
+   /* Set failure reason */
+   if (info) {
+   memory_exception_data.va = (info->page_addr) <<
+   PAGE_SHIFT;
+   memory_exception_data.failure.NotPresent =
+   info->prot_valid ? 1 : 0;
+   memory_exception_data.failure.NoExecute =
+   info->prot_exec ? 1 : 0;
+   memory_exception_data.failure.ReadOnly =
+   info->prot_write ? 1 : 0;
+   memory_exception_data.failure.imprecise = 0;
+   }
}
 
rcu_read_lock();
@@ -1261,7 +1266,8 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 
pasid,
idr_for_each_entry_continue(&p->event_idr, ev, id)
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
spin_lock(&ev->lock);
-   ev->memory_exception_data = memory_exception_data;
+   ev->memory_exception_data = data ? *data :
+   memory_exception_data;
set_event(ev);
spin_unlock(&ev->lock);
}
-- 
2.25.1



[PATCH 31/33] drm/amdkfd: add debug queue snapshot operation

2023-05-25 Thread Jonathan Kim
Allow the debugger to get a snapshot of a specified number of queues
containing various queue property information that is copied to the
debugger.

Since the debugger doesn't know how many queues exist at any given time,
allow it to pass 0 as the requested number of snapshots to get back the
actual number of potential snapshots, which it can use for a subsequent
request for the actual information.

To prevent future ABI breakage, pass in the requested entry_size.
The KFD will return its own entry_size in case the debugger still wants to
log the information in a core dump on a sizing failure.

Also allow the debugger to clear exceptions when doing a snapshot.
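
From the debugger side the intent is a two-call pattern.  A hedged user-space
sketch, assuming the AMDKFD_IOC_DBG_TRAP ioctl and kfd_ioctl_dbg_trap_args
layout added by this series (kfd_fd and target_pid are placeholders; includes
and error handling omitted):

        struct kfd_ioctl_dbg_trap_args args = {0};

        /* 1st call: num_queues == 0 only reports how many snapshots exist. */
        args.pid = target_pid;
        args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
        args.queue_snapshot.entry_size = sizeof(struct kfd_queue_snapshot_entry);
        ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);

        /* 2nd call: allocate that many entries and fetch the real data. */
        struct kfd_queue_snapshot_entry *buf =
                calloc(args.queue_snapshot.num_queues, sizeof(*buf));
        args.queue_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)buf;
        ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);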

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  6 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 36 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +++
 .../amd/amdkfd/kfd_process_queue_manager.c| 40 +++
 5 files changed, 90 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 00aa844762b0..b24a73fd53af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3053,6 +3053,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
&args->query_exception_info.info_size);
break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+   r = pqm_get_queue_snapshot(&p->pqm,
+   args->queue_snapshot.exception_mask,
+   (void __user 
+*)args->queue_snapshot.snapshot_buf_ptr,
+   &args->queue_snapshot.num_queues,
+   &args->queue_snapshot.entry_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
r = -EACCES;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 03fabe6e9cdb..9f52f8426ed1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3040,6 +3040,42 @@ int suspend_queues(struct kfd_process *p,
return total_suspended;
 }
 
+static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
+{
+   switch (q_props->type) {
+   case KFD_QUEUE_TYPE_COMPUTE:
+   return q_props->format == KFD_QUEUE_FORMAT_PM4
+   ? KFD_IOC_QUEUE_TYPE_COMPUTE
+   : KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
+   case KFD_QUEUE_TYPE_SDMA:
+   return KFD_IOC_QUEUE_TYPE_SDMA;
+   case KFD_QUEUE_TYPE_SDMA_XGMI:
+   return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
+   default:
+   WARN_ONCE(true, "queue type not recognized!");
+   return 0x;
+   };
+}
+
+void set_queue_snapshot_entry(struct queue *q,
+ uint64_t exception_clear_mask,
+ struct kfd_queue_snapshot_entry *qss_entry)
+{
+   qss_entry->ring_base_address = q->properties.queue_address;
+   qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
+   qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
+   qss_entry->ctx_save_restore_address =
+   q->properties.ctx_save_restore_area_address;
+   qss_entry->ctx_save_restore_area_size =
+   q->properties.ctx_save_restore_area_size;
+   qss_entry->exception_status = q->properties.exception_status;
+   qss_entry->queue_id = q->properties.queue_id;
+   qss_entry->gpu_id = q->device->id;
+   qss_entry->ring_size = (uint32_t)q->properties.queue_size;
+   qss_entry->queue_type = set_queue_type_for_user(&q->properties);
+   q->properties.exception_status &= ~exception_clear_mask;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index d4e6dbffe8c2..7dd4b177219d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -300,6 +300,9 @@ int suspend_queues(struct kfd_process *p,
 int resume_queues(struct kfd_process *p,
uint32_t num_queues,
uint32_t *usr_queue_id_array);
+void set_queue_snapshot_entry(struct queue *q,
+ uint64_t exception_clear_mask,
+ struct kfd_queue_snapshot_entry *qss_entry);
 int debu

[PATCH 30/33] drm/amdkfd: add debug query exception info operation

2023-05-25 Thread Jonathan Kim
Allow the debugger to query additional info based on an exception code.
For device exceptions, it's currently only memory violation information.
For process exceptions, it's currently only runtime information.
Queue exceptions only report the queue exception status.

The debugger has the option of clearing the target exception on query.
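
A hedged debugger-side sketch of the query, continuing the placeholders and
uapi assumptions from the queue snapshot example above; here the source is a
device, so the info buffer receives the memory violation data:

        struct kfd_hsa_memory_exception_data info = {0};

        args.op = KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO;
        args.query_exception_info.source_id = gpu_id;
        args.query_exception_info.exception_code = EC_DEVICE_MEMORY_VIOLATION;
        args.query_exception_info.clear_exception = 1;
        args.query_exception_info.info_ptr = (uint64_t)(uintptr_t)&info;
        args.query_exception_info.info_size = sizeof(info);
        ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);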

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
 3 files changed, 133 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ebb2088d12fa..00aa844762b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3045,6 +3045,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
&args->query_debug_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+   r = kfd_dbg_trap_query_exception_info(target,
+   args->query_exception_info.source_id,
+   args->query_exception_info.exception_code,
+   args->query_exception_info.clear_exception,
+   (void __user 
*)args->query_exception_info.info_ptr,
+   &args->query_exception_info.info_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index e9530e682e85..24e2b285448a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -890,6 +890,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+   uint32_t source_id,
+   uint32_t exception_code,
+   bool clear_exception,
+   void __user *info,
+   uint32_t *info_size)
+{
+   bool found = false;
+   int r = 0;
+   uint32_t copy_size, actual_info_size = 0;
+   uint64_t *exception_status_ptr = NULL;
+
+   if (!target)
+   return -EINVAL;
+
+   if (!info || !info_size)
+   return -EINVAL;
+
+   mutex_lock(>event_mutex);
+
+   if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
+   /* Per queue exceptions */
+   struct queue *queue = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct qcm_process_device *qpd = >qpd;
+
+   list_for_each_entry(queue, >queues_list, list) {
+   if (!found && queue->properties.queue_id == 
source_id) {
+   found = true;
+   break;
+   }
+   }
+   if (found)
+   break;
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(queue->properties.exception_status & 
KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+   exception_status_ptr = >properties.exception_status;
+   } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
+   /* Per device exceptions */
+   struct kfd_process_device *pdd = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   pdd = target->pdds[i];
+   if (pdd->dev->id == source_id) {
+   found = true;
+   break;
+   }
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+
+   if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
+   copy_size = min((size_t)(*info_size), 
pdd->vm_fault_exc_data_size);
+
+   if (copy_to_user(info, pdd->vm_fault_exc_data, 
copy_size)) {
+   r = -EFAULT;
+   goto out

[PATCH 32/33] drm/amdkfd: add debug device snapshot operation

2023-05-25 Thread Jonathan Kim
Similar to the queue snapshot, return an array of device information entries,
using an entry_size check on copy and returning the entry size actually used.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.

v2: add num_xcc to device snapshot and fixup new kfd_node reference
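
The size negotiation this describes amounts to a two-call pattern on the
debugger side; a fragment in the style of the query-info sketch earlier
(kfd_fd and args as before, the uapi entry struct name is taken from the
kernel code below):

        void *buf;

        /* Call once with num_devices too small to learn the real count ... */
        args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
        args.device_snapshot.exception_mask = 0;
        args.device_snapshot.snapshot_buf_ptr = 0;
        args.device_snapshot.num_devices = 0;
        args.device_snapshot.entry_size = sizeof(struct kfd_dbg_device_info_entry);
        ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args); /* num_devices/entry_size updated */

        /* ... then allocate and call again with the values the KFD returned. */
        buf = calloc(args.device_snapshot.num_devices,
                     args.device_snapshot.entry_size);
        args.device_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)buf;
        ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);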

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 73 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index b24a73fd53af..f522325b409b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3060,8 +3060,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
>queue_snapshot.entry_size);
break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-   pr_warn("Debug op %i not supported yet\n", args->op);
-   r = -EACCES;
+   r = kfd_dbg_trap_device_snapshot(target,
+   args->device_snapshot.exception_mask,
+   (void __user 
*)args->device_snapshot.snapshot_buf_ptr,
+   >device_snapshot.num_devices,
+   >device_snapshot.entry_size);
break;
default:
pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 24e2b285448a..125274445f43 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include 
 #include 
 
@@ -1010,6 +1011,78 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+   uint64_t exception_clear_mask,
+   void __user *user_info,
+   uint32_t *number_of_device_infos,
+   uint32_t *entry_size)
+{
+   struct kfd_dbg_device_info_entry device_info;
+   uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+   int i, r = 0;
+
+   if (!(target && user_info && number_of_device_infos && entry_size))
+   return -EINVAL;
+
+   tmp_num_devices = min_t(size_t, *number_of_device_infos, 
target->n_pdds);
+   *number_of_device_infos = target->n_pdds;
+   *entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+   if (!tmp_num_devices)
+   return 0;
+
+   memset(_info, 0, sizeof(device_info));
+
+   mutex_lock(>event_mutex);
+
+   /* Run over all pdd of the process */
+   for (i = 0; i < tmp_num_devices; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct kfd_topology_device *topo_dev = 
kfd_topology_device_by_id(pdd->dev->id);
+
+   device_info.gpu_id = pdd->dev->id;
+   device_info.exception_status = pdd->exception_status;
+   device_info.lds_base = pdd->lds_base;
+   device_info.lds_limit = pdd->lds_limit;
+   device_info.scratch_base = pdd->scratch_base;
+   device_info.scratch_limit = pdd->scratch_limit;
+   device_info.gpuvm_base = pdd->gpuvm_base;
+   device_info.gpuvm_limit = pdd->gpuvm_limit;
+   device_info.location_id = topo_dev->node_props.location_id;
+   device_info.vendor_id = topo_dev->node_props.vendor_id;
+   device_info.device_id = topo_dev->node_props.device_id;
+   device_info.revision_id = pdd->dev->adev->pdev->revision;
+   device_info.subsystem_vendor_id = 
pdd->dev->adev->pdev->subsystem_vendor;
+   device_info.subsystem_device_id = 
pdd->dev->adev->pdev->subsystem_device;
+   device_info.fw_version = pdd->dev->kfd->mec_fw_version;
+   device_info.gfx_target_version =
+   topo_dev->node_props.gfx_target_version;
+   device_info.simd_count = topo_dev->node_props.simd_count;
+   device_info.max_waves_per_simd =
+   topo_dev->node_props.max_waves_per_simd;
+   device_info.array_count = topo_dev->node_props.array_count;
+   device_info.simd_arrays_per_engine =
+   topo_dev->node_props.simd_arrays_per_engine;
+

[PATCH 29/33] drm/amdkfd: add debug query event operation

2023-05-25 Thread Jonathan Kim
Allow the debugger to query a single queue, device or process
exception.
The KFD also returns the GPU or queue id associated with the exception.
The debugger also has the option of clearing exceptions after
they are queried.
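
Taken together with the event file supplied at debug-enable time, this gives the
debugger a simple wait-then-query loop; a fragment reusing the assumptions from
the earlier sketches (the -EAGAIN convention and the clear-on-query mask come
from this patch, dbg_ev_fd and handle_event are placeholders, <poll.h> needed):

        struct pollfd pfd = { .fd = dbg_ev_fd, .events = POLLIN };
        int r;

        /* Sketch only: wait for the KFD to poke the event fd, then drain
         * pending exceptions until the query reports none left (EAGAIN). */
        poll(&pfd, 1, -1);
        do {
                memset(&args, 0, sizeof(args));
                args.pid = target;
                args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
                args.query_debug_event.exception_mask = subscribed_mask; /* clear-on-query */
                r = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
                if (!r)
                        handle_event(args.query_debug_event.gpu_id,
                                     args.query_debug_event.queue_id,
                                     args.query_debug_event.exception_mask);
        } while (!r);   /* ioctl fails with errno == EAGAIN when nothing is pending */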

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 75 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e5d95b144dcd..ebb2088d12fa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3038,6 +3038,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
r = kfd_dbg_trap_set_flags(target, >set_flags.flags);
break;
case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+   r = kfd_dbg_ev_query_debug_event(target,
+   >query_debug_event.queue_id,
+   >query_debug_event.gpu_id,
+   args->query_debug_event.exception_mask,
+   >query_debug_event.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 43c3170998d3..e9530e682e85 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -27,6 +27,70 @@
 
 #define MAX_WATCH_ADDRESSES4
 
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+ unsigned int *queue_id,
+ unsigned int *gpu_id,
+ uint64_t exception_clear_mask,
+ uint64_t *event_status)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+
+   if (!(process && process->debug_trap_enabled))
+   return -ENODATA;
+
+   mutex_lock(>event_mutex);
+   *event_status = 0;
+   *queue_id = 0;
+   *gpu_id = 0;
+
+   /* find and report queue events */
+   pqm = >pqm;
+   list_for_each_entry(pqn, >queues, process_queue_list) {
+   uint64_t tmp = process->exception_enable_mask;
+
+   if (!pqn->q)
+   continue;
+
+   tmp &= pqn->q->properties.exception_status;
+
+   if (!tmp)
+   continue;
+
+   *event_status = pqn->q->properties.exception_status;
+   *queue_id = pqn->q->properties.queue_id;
+   *gpu_id = pqn->q->device->id;
+   pqn->q->properties.exception_status &= ~exception_clear_mask;
+   goto out;
+   }
+
+   /* find and report device events */
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+   uint64_t tmp = process->exception_enable_mask
+   & pdd->exception_status;
+
+   if (!tmp)
+   continue;
+
+   *event_status = pdd->exception_status;
+   *gpu_id = pdd->dev->id;
+   pdd->exception_status &= ~exception_clear_mask;
+   goto out;
+   }
+
+   /* report process events */
+   if (process->exception_enable_mask & process->exception_status) {
+   *event_status = process->exception_status;
+   process->exception_status &= ~exception_clear_mask;
+   }
+
+out:
+   mutex_unlock(>event_mutex);
+   return *event_status ? 0 : -EAGAIN;
+}
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
struct kfd_process *process;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index ef8e9f7f1716..e78f954c0684 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -27,6 +27,11 @@
 
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int 
unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+   unsigned int *queue_id,
+   unsigned int *gpu_id,
+   uint64_t exception_clear_mask,
+   uint64_t *event_status);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
   unsigned int pasid,
   uint32_t doorbell_id,
-- 
2.25.1



[PATCH 22/33] drm/amdkfd: update process interrupt handling for debug events

2023-05-25 Thread Jonathan Kim
The debugger must be notified of any debugger-subscribed exception
that comes from a hardware interrupt.

If a debugger session exits, any exceptions it subscribed to may still
have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
To prevent a new session from inheriting stale interrupts, when a new
queue is created, open an interrupt drain and allow the IH ring to drain
from a timestamped checkpoint.  Then inject a custom IV so that once
the custom IV is picked up by the KFD, it's safe to close the drain
and proceed with queue creation.

The drain must also happen on debug disable, as SW interrupts may still
be processed.  Drain at that point and clear all exception status.

The debugger may also not be attached or subscribed to certain
exceptions, so forward those directly to the runtime.

GFX10 also requires its own IV processing, hence the creation of
kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
packed into a new contiguous format, unlike GFX9.  To make this clear,
a separate interrupt handling code file was created.

v2: use new kfd_node struct in prototypes.
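
A sketch of the drain sequence on the KFD side as described above; only
amdgpu_amdkfd_send_close_event_drain_irq() below is added by this patch, the
flag, helper name and payload encoding are illustrative assumptions:

/* Sketch only: fence off stale interrupts before the first queue of a new
 * debug session is created. */
static int kfd_drain_stale_interrupts(struct kfd_process_device *pdd)
{
        uint32_t irq_drain_fence[8] = {0};
        int r;

        /* 1) Mark the drain open so interrupt handlers drop stale events. */
        WRITE_ONCE(pdd->process->irq_drain_is_open, true);

        /* 2) Wait for the IH ring to pass a timestamped checkpoint, then
         *    inject a custom IV carrying the pasid (illustrative encoding). */
        irq_drain_fence[3] = pdd->process->pasid;
        r = amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
                                                     irq_drain_fence);
        if (r) {
                WRITE_ONCE(pdd->process->irq_drain_is_open, false);
                return r;
        }

        /* 3) The interrupt handler closes the drain when it sees the custom
         *    IV; queue creation waits for that before proceeding. */
        return 0;
}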

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
 drivers/gpu/drm/amd/amdkfd/Makefile   |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  84 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  21 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 -
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  47 ++
 .../amd/amdkfd/kfd_process_queue_manager.c|   4 +
 12 files changed, 680 insertions(+), 20 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66f80b9ab0c5..98cd52bb005f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -777,6 +777,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev, bo
amdgpu_umc_poison_handler(adev, reset);
 }
 
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload)
+{
+   int ret;
+
+   /* Device or IH ring is not ready so bail. */
+   ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, >irq.ih);
+   if (ret)
+   return ret;
+
+   /* Send payload to fence KFD interrupts */
+   amdgpu_amdkfd_interrupt(adev, payload);
+
+   return 0;
+}
+
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
 {
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 94cc456761e5..dd740e64e6e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -250,6 +250,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct 
amdgpu_device *dst,
struct amdgpu_device *src,
bool is_min);
 int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool 
is_min);
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 747754428073..2ec8f27c5366 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
+   $(AMDKFD_PATH)/kfd_int_process_v10.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 17e8e9edccbf..68b657398d41 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,64 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+/* set pending event queue entry from ring entry  */
+bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
+  unsigned int pasid,
+  

[PATCH 23/33] drm/amdkfd: add debug set exceptions enabled operation

2023-05-25 Thread Jonathan Kim
The debugger subscribes to notifications for requested exceptions on attach.
Allow the debugger to change its subscription later on.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 73cb5abce431..80d354eade35 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2980,6 +2980,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->send_runtime_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+   kfd_dbg_set_enabled_debug_exception_mask(target,
+   args->set_exceptions_enabled.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 68b657398d41..48a4e3cc2234 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -521,3 +521,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
 
return r;
 }
+
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask)
+{
+   uint64_t found_mask = 0;
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   int i;
+
+   mutex_lock(>event_mutex);
+
+   found_mask |= target->exception_status;
+
+   pqm = >pqm;
+   list_for_each_entry(pqn, >queues, process_queue_list) {
+   if (!pqn->q)
+   continue;
+
+   found_mask |= pqn->q->properties.exception_status;
+   }
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   found_mask |= pdd->exception_status;
+   }
+
+   if (exception_set_mask & found_mask)
+   kernel_write(target->dbg_ev_file, _data, 1, );
+
+   target->exception_enable_mask = exception_set_mask;
+
+   mutex_unlock(>event_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 5153ccbd7fd1..6c1054a08872 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -56,6 +56,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct 
kfd_node *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask);
 /*
  * If GFX off is enabled, chips that do not support RLC restore for the debug
  * registers will disable GFX off temporarily for the entire debug session.
-- 
2.25.1



[PATCH 33/33] drm/amdkfd: bump kfd ioctl minor version for debug api availability

2023-05-25 Thread Jonathan Kim
Bump the minor version to declare debugging capability is now
available.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
 include/uapi/linux/kfd_ioctl.h   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f522325b409b..56f55da482e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2984,7 +2984,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
if (!r)
target->exception_enable_mask = 
args->enable.exception_mask;
 
-   pr_warn("Debug functions limited\n");
break;
case KFD_IOC_DBG_TRAP_DISABLE:
r = kfd_dbg_trap_disable(target);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index dfe745ee427e..ea0d50955eac 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -38,9 +38,10 @@
  * - 1.10 - Add SMI profiler event log
  * - 1.11 - Add unified memory for ctx save/restore area
  * - 1.12 - Add DMA buf export ioctl
+ * - 1.13 - Add debugger API
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 12
+#define KFD_IOCTL_MINOR_VERSION 13
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.25.1



[PATCH 25/33] drm/amdkfd: add debug wave launch mode operation

2023-05-25 Thread Jonathan Kim
Allow the debugger to set the wave launch behaviour to either operate
normally, halt at launch, trap on every instruction, terminate immediately
or stall on allocation.

v2: fixup with new kfd_node struct reference for mes check

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 12 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 25 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  3 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 14 +++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 25 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 36 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  2 ++
 11 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index d7881bbd828d..774ecfc3451a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -107,6 +107,17 @@ static uint32_t 
kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
return data;
 }
 
+static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, 
wave_launch_mode);
+
+   return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -129,6 +140,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
.validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
+   .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index ec2587664001..fbdc1b7b1e42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -412,6 +412,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.disable_debug_trap = kgd_arcturus_disable_debug_trap,
.validate_trap_override_request = 
kgd_gfx_v9_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_gfx_v9_set_wave_launch_trap_override,
+   .set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 7ea0362dcab3..a7a6edda557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -856,6 +856,30 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
return 0;
 }
 
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+   bool is_mode_set = !!wave_launch_mode;
+
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+   VMID_MASK, is_mode_set ? 1 << vmid : 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+   MODE, is_mode_set ? wave_launch_mode : 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  * ib_offload_wait_time -- Wait Count for Indirect Buffer Offloads.
@@ -944,6 +968,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
.validate_trap_override_request = 
kgd_gfx_v10_validate_trap_over

[PATCH 27/33] drm/amdkfd: add debug set and clear address watch points operation

2023-05-25 Thread Jonathan Kim
Shader read, write and atomic memory operations can be alerted to the
debugger as an address watch exception.

Allow the debugger to pass in a watch point to a particular memory
address per device.

Note that only 4 watch points exist per device to date, so have
the KFD keep track of which watch points are allocated or not.

v2: fixup with new kfd_node struct reference for mes and watch point
checks
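
One way the KFD can do that bookkeeping is a small per-device bitmap of the four
watch IDs; a sketch of the idea (the actual kfd_debug.c hunk is not shown in this
excerpt, so the field and lock names here are illustrative):

/* Sketch only: allocate/release one of MAX_WATCH_ADDRESSES (4) watch IDs. */
static int watch_id_alloc(struct kfd_node *dev, uint32_t *watch_id)
{
        int i;

        spin_lock(&dev->watch_points_lock);             /* illustrative lock */
        for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
                if (!(dev->alloc_watch_ids & (1 << i))) {
                        dev->alloc_watch_ids |= 1 << i;
                        spin_unlock(&dev->watch_points_lock);
                        *watch_id = i;
                        return 0;
                }
        }
        spin_unlock(&dev->watch_points_lock);
        return -ENOMEM;                                 /* all 4 points in use */
}

static void watch_id_free(struct kfd_node *dev, uint32_t watch_id)
{
        spin_lock(&dev->watch_points_lock);
        dev->alloc_watch_ids &= ~(1 << watch_id);
        spin_unlock(&dev->watch_points_lock);
}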

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  51 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  78 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|   8 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  52 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  77 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  24 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 136 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   6 +-
 13 files changed, 452 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 774ecfc3451a..efd6a72aab4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -118,6 +118,55 @@ static uint32_t kgd_aldebaran_set_wave_launch_mode(struct 
amdgpu_device *adev,
return data;
 }
 
+#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
+static uint32_t kgd_gfx_aldebaran_set_address_watch(
+   struct amdgpu_device *adev,
+   uint64_t watch_address,
+   uint32_t watch_address_mask,
+   uint32_t watch_id,
+   uint32_t watch_mode,
+   uint32_t debug_vmid)
+{
+   uint32_t watch_address_high;
+   uint32_t watch_address_low;
+   uint32_t watch_address_cntl;
+
+   watch_address_cntl = 0;
+   watch_address_low = lower_32_bits(watch_address);
+   watch_address_high = upper_32_bits(watch_address) & 0xffff;
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MODE,
+   watch_mode);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MASK,
+   watch_address_mask >> 6);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   VALID,
+   1);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_high);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_low);
+
+   return watch_address_cntl;
+}
+
+uint32_t kgd_gfx_aldebaran_clear_address_watch(struct amdgpu_device *adev,
+   uint32_t watch_id)
+{
+   return 0;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -141,6 +190,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
+   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
+   .clear_address_watch = kgd_gfx_aldebaran_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index fbdc1b7b1e42..6df215aba4c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -413,6 +413,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.validate_trap_override_request = 
kgd_gfx_v9_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_gfx_v9_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
+   .set_add

[PATCH 17/33] drm/amdkfd: apply trap workaround for gfx11

2023-05-25 Thread Jonathan Kim
Due to a HW bug, waves in only half the shader arrays can enter trap.

When starting a debug session, relocate all waves to the first shader
array of each shader engine and mask off the 2nd shader array as
unavailable.

When ending a debug session, re-enable the 2nd shader array per
shader engine.

User CU masking per queue cannot be guaranteed to remain functional
if requested during debugging (e.g. a user CU mask that requests only the 2nd
shader array as an available resource leads to zero HW resources available),
nor can the runtime be alerted of any of these changes during execution.

Make user CU masking and debugging mutually exclusive with respect to
availability.

If the debugger tries to attach to a process with a user cu masked
queue, return the runtime status as enabled but busy.

If the debugger tries to attach and fails to reallocate queue waves to
the first shader array of each shader engine, return the runtime status
as enabled but with an error.

In addition, as on any other device that supports multi-process debugging,
disable the per-process trap temporary setup to avoid the performance impact
of the setup overhead.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  7 +--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  2 -
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 57 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  3 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 +++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  9 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  7 ++-
 14 files changed, 122 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index d20df0cf0d88..b5f5eed2b5ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -219,6 +219,8 @@ struct mes_add_queue_input {
uint32_tgws_size;
uint64_ttba_addr;
uint64_ttma_addr;
+   uint32_ttrap_en;
+   uint32_tskip_process_ctx_clear;
uint32_tis_kfd_process;
uint32_tis_aql_queue;
uint32_tqueue_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 861910a6662d..c4e3cb8d44de 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -202,17 +202,14 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.gws_size = input->gws_size;
mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
mes_add_queue_pkt.tma_addr = input->tma_addr;
+   mes_add_queue_pkt.trap_en = input->trap_en;
+   mes_add_queue_pkt.skip_process_ctx_clear = 
input->skip_process_ctx_clear;
mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
 
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL 
queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
 
-   if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
- (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
- (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3
-   mes_add_queue_pkt.trap_en = 1;
-
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL 
queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 9d0c247f80fe..a5c457863048 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -537,8 +537,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct 
kfd_process *p,
goto out;
}
 
-   minfo.update_flag = UPDATE_FLAG_CU_MASK;
-
mutex_lock(>mutex);
 
retval = pqm_update_mqd(>pqm, args->queue_id, );
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 73b07b5f17f1..5e2ee2d1acc4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,57 @@
 #include "kfd_device_queue_manager.h"
 #include 
 
+static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
+{
+   struct mqd_update_info minfo = {0};
+   

[PATCH 21/33] drm/amdkfd: add debug trap enabled flag to tma

2023-05-25 Thread Jonathan Kim
From: Jay Cornwall 

Trap handler behavior will differ when a debugger is attached.

Make the debug trap flag available in the trap handler TMA.
Update it when the debug trap ioctl is invoked.
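
Concretely, with the 64-bit view used in the hunk below, tma[2] places the flag
at byte offset 16 from the start of the TMA, which is where the trap handler is
expected to read it; a named constant (illustrative only, not part of this patch)
makes the layout explicit:

/* Debug-enabled flag lives in the third u64 of the TMA (byte offset 16). */
#define KFD_TMA_DEBUG_FLAG_BYTE_OFFSET  (2 * sizeof(uint64_t))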

Signed-off-by: Jay Cornwall 
Reviewed-by: Felix Kuehling 
Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 15 +++
 3 files changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index a19c21d04438..17e8e9edccbf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -256,6 +256,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, 
bool unwind, int unwind
if (unwind && i == unwind_count)
break;
 
+   kfd_process_set_trap_debug_flag(>qpd, false);
+
/* GFX off is already disabled by debug activate if not RLC 
restore supported. */
if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
@@ -351,6 +353,15 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 
+   /*
+* Setting the debug flag in the trap handler requires that the 
TMA has been
+* allocated, which occurs during CWSR initialization.
+* In the event that CWSR has not been initialized at this 
point, setting the
+* flag will be called again during CWSR initialization if the 
target process
+* is still debug enabled.
+*/
+   kfd_process_set_trap_debug_flag(>qpd, true);
+
if (!pdd->dev->kfd->shared_resources.enable_mes)
r = debug_refresh_runlist(pdd->dev->dqm);
else
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4b80f74b9de0..a02fb939614a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1157,6 +1157,8 @@ int kfd_init_apertures(struct kfd_process *process);
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
  uint64_t tba_addr,
  uint64_t tma_addr);
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+bool enabled);
 
 /* CWSR initialization */
 int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8bfd0c91fb92..2a60c630ab5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1309,6 +1309,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, 
struct file *filep)
 
memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, 
dev->kfd->cwsr_isa_size);
 
+   kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
+
qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for 
pqm.\n",
qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1345,6 +1347,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct 
kfd_process_device *pdd)
 
memcpy(qpd->cwsr_kaddr, dev->kfd->cwsr_isa, dev->kfd->cwsr_isa_size);
 
+   kfd_process_set_trap_debug_flag(>qpd,
+   pdd->process->debug_trap_enabled);
+
qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1431,6 +1436,16 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool 
supported)
return true;
 }
 
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+bool enabled)
+{
+   if (qpd->cwsr_kaddr) {
+   uint64_t *tma =
+   (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+   tma[2] = enabled;
+   }
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
-- 
2.25.1



[PATCH 15/33] drm/amdgpu: expose debug api for mes

2023-05-25 Thread Jonathan Kim
Similar to the F32 HWS, the RS64 HWS for GFX11 now supports a multi-process
debug API.

The skip_process_ctx_clear ADD_QUEUE requirement is to prevent the MES
from clearing the process context when the first queue is added to the
scheduler in order to maintain debug mode settings during queue preemption
and restore.  The MES clears the process context in this case due to an
unresolved FW caching bug during normal mode operations.
During debug mode, the KFD will hold a reference to the target process
so the process context should never go stale and MES can afford to skip
this requirement.
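
For reference, a sketch of a KFD-side call into the new helper (the function and
the flags union layout are from this patch; adev, the process context address,
the SPI value and the watch control array are placeholders supplied by the debug
code):

/* Sketch only: program the shader debugger state through the MES. */
uint32_t watch_cntl[4] = {0};   /* TCP watch control values, one per watch point */
uint32_t flags = 1;             /* bit 0 = single_memop in the flags union */
int r;

r = amdgpu_mes_set_shader_debugger(adev,
                                   process_context_addr,   /* per-process MES context */
                                   spi_gdbg_per_vmid_cntl, /* per-VMID SPI debug control */
                                   watch_cntl,
                                   flags);
if (r)
        pr_debug("MES set_shader_debugger failed: %d\n", r);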

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   | 32 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   | 20 
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c| 12 +++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h | 21 +++-
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 49bb6c03d606..20cc3fffe921 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -924,6 +924,38 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, 
uint32_t reg,
return r;
 }
 
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+   uint64_t process_context_addr,
+   uint32_t spi_gdbg_per_vmid_cntl,
+   const uint32_t *tcp_watch_cntl,
+   uint32_t flags)
+{
+   struct mes_misc_op_input op_input = {0};
+   int r;
+
+   if (!adev->mes.funcs->misc_op) {
+   DRM_ERROR("mes set shader debugger is not supported!\n");
+   return -EINVAL;
+   }
+
+   op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
+   op_input.set_shader_debugger.process_context_addr = 
process_context_addr;
+   op_input.set_shader_debugger.flags.u32all = flags;
+   op_input.set_shader_debugger.spi_gdbg_per_vmid_cntl = 
spi_gdbg_per_vmid_cntl;
+   memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
+   sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
+
+   amdgpu_mes_lock(>mes);
+
+   r = adev->mes.funcs->misc_op(>mes, _input);
+   if (r)
+   DRM_ERROR("failed to set_shader_debugger\n");
+
+   amdgpu_mes_unlock(>mes);
+
+   return r;
+}
+
 static void
 amdgpu_mes_ring_to_queue_props(struct amdgpu_device *adev,
   struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 547ec35691fa..d20df0cf0d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -256,6 +256,7 @@ enum mes_misc_opcode {
MES_MISC_OP_READ_REG,
MES_MISC_OP_WRM_REG_WAIT,
MES_MISC_OP_WRM_REG_WR_WAIT,
+   MES_MISC_OP_SET_SHADER_DEBUGGER,
 };
 
 struct mes_misc_op_input {
@@ -278,6 +279,20 @@ struct mes_misc_op_input {
uint32_t   reg0;
uint32_t   reg1;
} wrm_reg;
+
+   struct {
+   uint64_t process_context_addr;
+   union {
+   struct {
+   uint64_t single_memop : 1;
+   uint64_t single_alu_op : 1;
+   uint64_t reserved: 30;
+   };
+   uint32_t u32all;
+   } flags;
+   uint32_t spi_gdbg_per_vmid_cntl;
+   uint32_t tcp_watch_cntl[4];
+   } set_shader_debugger;
};
 };
 
@@ -340,6 +355,11 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, 
uint32_t reg,
 int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev,
  uint32_t reg0, uint32_t reg1,
  uint32_t ref, uint32_t mask);
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+   uint64_t process_context_addr,
+   uint32_t spi_gdbg_per_vmid_cntl,
+   const uint32_t *tcp_watch_cntl,
+   uint32_t flags);
 
 int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
int queue_type, int idx,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 90b4a74ccf01..861910a6662d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -339,6 +339,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
misc_pkt.wait_

[PATCH 28/33] drm/amdkfd: add debug set flags operation

2023-05-25 Thread Jonathan Kim
Allow the debugger to set single memory and single ALU operations.

Some exceptions are imprecise (memory violations, address watch) in the
sense that a trap occurs only when the exception interrupt occurs and
not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
the program counter address, which means that these values will not point
to the faulty instruction address but to wherever the program counter was
when the interrupt was raised.

Setting the Single Memory Operations flag will inject an automatic wait
on every memory operation instruction forcing imprecise memory exceptions
to become precise at the cost of performance.  This setting is not
permitted on debug devices that support only a global setting of this
option.

Return the previous set flags to the debugger as well.

v2: fixup with new kfd_node struct reference mes checks
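
The flags word is a full round trip: the debugger passes the new flags in and the
KFD writes the previous flags back.  A short user-side fragment in the style of
the earlier sketches (only the op and flag names below come from this series):

        /* Sketch only: request precise memory exceptions, keep the old flags. */
        args.op = KFD_IOC_DBG_TRAP_SET_FLAGS;
        args.set_flags.flags = KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
        if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
                prev_flags = args.set_flags.flags;  /* previous flags written back */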

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 58 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
 3 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e88be582d44d..e5d95b144dcd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3035,6 +3035,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->clear_node_address_watch.id);
break;
case KFD_IOC_DBG_TRAP_SET_FLAGS:
+   r = kfd_dbg_trap_set_flags(target, >set_flags.flags);
+   break;
case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 4b36cc8b5fb7..43c3170998d3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -23,6 +23,7 @@
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
 #include 
+#include 
 
 #define MAX_WATCH_ADDRESSES4
 
@@ -423,6 +424,59 @@ static void kfd_dbg_clear_process_address_watch(struct 
kfd_process *target)
kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], 
j);
 }
 
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
+{
+   uint32_t prev_flags = target->dbg_flags;
+   int i, r = 0, rewind_count = 0;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
+   (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
+   *flags = prev_flags;
+   return -EACCES;
+   }
+   }
+
+   target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
+   *flags = prev_flags;
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   continue;
+
+   if (!pdd->dev->kfd->shared_resources.enable_mes)
+   r = debug_refresh_runlist(pdd->dev->dqm);
+   else
+   r = kfd_dbg_set_mes_debug_mode(pdd);
+
+   if (r) {
+   target->dbg_flags = prev_flags;
+   break;
+   }
+
+   rewind_count++;
+   }
+
+   /* Rewind flags */
+   if (r) {
+   target->dbg_flags = prev_flags;
+
+   for (i = 0; i < rewind_count; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   continue;
+
+   if (!pdd->dev->kfd->shared_resources.enable_mes)
+   debug_refresh_runlist(pdd->dev->dqm);
+   else
+   kfd_dbg_set_mes_debug_mode(pdd);
+   }
+   }
+
+   return r;
+}
 
 /* kfd_dbg_trap_deactivate:
  * target: target process
@@ -437,9 +491,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, 
bool unwind, int unwind
int i;
 
if (!unwind) {
+   uint32_t flags = 0;
+
cancel_work_sync(>debug_event_workarea);
kfd_dbg_clear_process_address_watch(target);
kfd_dbg_trap_set_wave_launch_mode(target, 0);
+
+   kfd_dbg_trap_set_flags(target, );
}
 
for (i = 0; i < target->n_pdds; i++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 7f0757c2af2c..ef8e9f7f1716 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h

[PATCH 11/33] drm/amdgpu: add gfx11 hw debug mode enable and disable calls

2023-05-25 Thread Jonathan Kim
Implement the per-device calls to enable or disable HW debug mode
for GFX11.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 38 +++
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 7deff8a547fb..cc954cf248ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -607,6 +607,42 @@ static void set_vm_context_page_table_base_v11(struct 
amdgpu_device *adev,
adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_gfx_v11_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
+/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.program_sh_mem_settings = program_sh_mem_settings_v11,
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -623,4 +659,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.wave_control_execute = wave_control_execute_v11,
.get_atc_vmid_pasid_mapping_info = NULL,
.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
+   .enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
+   .disable_debug_trap = kgd_gfx_v11_disable_debug_trap
 };
-- 
2.25.1



[PATCH 24/33] drm/amdkfd: add debug wave launch override operation

2023-05-25 Thread Jonathan Kim
This operation allows the debugger to override the enabled HW
exceptions on the device.

On debug devices that only support the debugging of a single process,
the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
register.
Because they are global, only address watch exceptions are allowed to
be enabled.  In other words, the debugger must preserve all non-address
watch exception states in normal mode operation by barring a full
replacement override or a non-address watch override request.

For multi-process debugging, all HW exception overrides are per-VMID so
all exceptions can be overridden or fully replaced.

In order for the debugger to know what is permissible, return the
supported override mask back to the debugger along with the previously
enabled overrides.

v2: fixup with new kfd_node struct reference for mes check
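
The per-VMID merge rule used below is easy to misread, so here is the arithmetic
spelled out for the OR path (the expression is the one in
kgd_aldebaran_set_wave_launch_trap_override; the example bit values are arbitrary):

/*
 * new EXCP_EN = (trap_mask_bits & trap_mask_request) |
 *               (previous EXCP_EN & ~trap_mask_request)
 *
 * e.g. previous EXCP_EN  = 0b0011
 *      trap_mask_request = 0b0110   (bits the debugger wants to touch)
 *      trap_mask_bits    = 0b0100   (new values for those bits)
 *
 *      new EXCP_EN = (0b0100 & 0b0110) | (0b0011 & ~0b0110)
 *                  = 0b0100 | 0b0001
 *                  = 0b0101
 *
 * Bits outside the request keep their previous values; EXCP_REPLACE is
 * programmed with the requested override type.
 */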

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 55 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h| 10 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 87 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 69 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  6 ++
 11 files changed, 351 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index b811a0985050..d7881bbd828d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -25,6 +25,7 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
+#include 
 
 /*
  * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
@@ -62,6 +63,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct 
amdgpu_device *adev,
return data;
 }
 
+static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device 
*adev,
+   uint32_t trap_override,
+   uint32_t 
*trap_mask_supported)
+{
+   *trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+   KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+   KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+   KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+   KFD_DBG_TRAP_MASK_FP_INEXACT |
+   KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+   KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+   if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+   trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+   return -EPERM;
+
+   return 0;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
+   uint32_t vmid,
+   uint32_t trap_override,
+   uint32_t trap_mask_bits,
+   uint32_t trap_mask_request,
+   uint32_t *trap_mask_prev,
+   uint32_t kfd_dbg_trap_cntl_prev)
+
+{
+   uint32_t data = 0;
+
+   *trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, 
SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+   trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+   (*trap_mask_prev & ~trap_mask_request);
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 
trap_mask_bits);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 
trap_override);
+
+   return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -82,6 +127,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
+   .set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
   

[PATCH 26/33] drm/amdkfd: add debug suspend and resume process queues operation

2023-05-25 Thread Jonathan Kim
In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A queue that is newly created cannot be suspended because queue ids are
recycled after destruction so the debugger needs to know that this has
occurred.  Query functions will be added later that clear a given
queue's new-queue status.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblock when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).

v2: fixup new kfd_node struct reference for mes fw check.
also fixup missing EC_QUEUE_NEW flagging on newly created queue.
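
From the debugger's point of view the queue array is both input and output; a
fragment in the style of the earlier sketches (the status bit names are not part
of this excerpt, so they are only described in the comments):

        /* Sketch only: suspend two queues and inspect the per-queue results. */
        uint32_t queue_ids[2] = { qid_a, qid_b };       /* placeholder queue ids */

        args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
        args.suspend_queues.num_queues = 2;
        args.suspend_queues.grace_period = 0;
        args.suspend_queues.exception_mask = 0;
        args.suspend_queues.queue_array_ptr = (uint64_t)(uintptr_t)queue_ids;

        r = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
        /* On success r is the number of queues actually suspended; each
         * queue_ids[i] now carries status flags in its upper bits marking the
         * request invalid (new/destroyed/missing queue) or failed (HWS error). */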

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  11 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 447 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  15 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|   1 +
 11 files changed, 512 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 98cd52bb005f..b4fcad0e62f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev)
return adev->have_atomics_support;
 }
 
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
+{
+   amdgpu_device_flush_hdp(adev, NULL);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 
bool reset)
 {
amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index dd740e64e6e1..2d0406bff84e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  uint64_t *mmap_offset);
 int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
  struct dma_buf **dmabuf);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4b45d4539d48..adda60273456 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
pr_debug("Write ptr address   == 0x%016llX\n",
args->write_pointer_address);
 
+   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, 
NULL, 0);
return 0;
 
 err_create_queue:
@@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->launch_mode.launch_mode);
break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   r = suspend_queues(target,
+   args->suspend_queues.num_queues,
+   args->suspend_queues.grace_period,
+   args->suspend_queues.exception_mask,
+   (uint32_t 
*)args->suspend_queues.queue_array_ptr);
+
+   break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   r = resume_queues(target, args->resume_queues.num_queues,
+   (uint32_t 
*)args->resume_queues.queue_array_ptr);
+   break;
case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_C

[PATCH 19/33] drm/amdkfd: add send exception operation

2023-05-25 Thread Jonathan Kim
Add a debug operation that allows the debugger to send an exception
directly to runtime through a payload address.

For memory violations, a normal vm fault signal is applied to notify
the runtime instead, using the exception data that was saved when the
memory violation was originally raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function which will be explained and implemented in a follow up
patch.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 43 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 71 ++-
 8 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 4ebfff6b6c55..795382b55e0a 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
return;
 
if (info.vmid == vmid)
-   kfd_signal_vm_fault_event(dev, pasid, &info);
+   kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
else
-   kfd_signal_vm_fault_event(dev, pasid, NULL);
+   kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a5c457863048..ec5a85454192 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2833,6 +2833,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
r = kfd_dbg_trap_disable(target);
break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+   r = kfd_dbg_send_exception_to_runtime(target,
+   args->send_runtime_event.gpu_id,
+   args->send_runtime_event.queue_id,
+   args->send_runtime_event.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index dccb27fc764b..61098975bb0e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+   unsigned int dev_id,
+   unsigned int queue_id,
+   uint64_t error_reason)
+{
+   if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   struct kfd_process_device *pdd = NULL;
+   struct kfd_hsa_memory_exception_data *data;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   if (p->pdds[i]->dev->id == dev_id) {
+   pdd = p->pdds[i];
+   break;
+   }
+   }
+
+   if (!pdd)
+   return -ENODEV;
+
+   data = (struct kfd_hsa_memory_exception_data *)
+   pdd->vm_fault_exc_data;
+
+   kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+   kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+   error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+   }
+
+   if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+   /*
+* block should only happen after the debugger receives runtime
+* enable notice.
+*/
+   up(&p->runtime_enable_sema);
+   error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+   }
+
+   if (error_reason)
+   return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+   return 0;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
struct mqd_update_info minfo = {0};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 66ee7b95d08a..2c6866bb8850 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -34

[PATCH 18/33] drm/amdkfd: add raise exception event function

2023-05-25 Thread Jonathan Kim
Exception events can be generated from interrupts or queue activity.

The raise event function will save the exception status of a queue,
device or process, then notify the debugger of the status change by
writing to a debugger-polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states via a
query operation that will be provided by follow-up patches.

v2: use new kfd_node struct in prototype.
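
For context, the debugger side of this notification is only a poll on the
file descriptor it handed to the KFD at attach time; the kernel writes a
single '.' byte per raised event.  A minimal sketch of that wait loop is
below (the fd setup itself belongs to the attach step and is assumed here):

#include <poll.h>
#include <unistd.h>

/* Hedged sketch: dbg_ev_fd is assumed to be the read end of the pipe whose
 * write end was registered with the KFD on debug attach. */
static int wait_for_exception_notice(int dbg_ev_fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = dbg_ev_fd, .events = POLLIN };
        char drain[16];
        int ret = poll(&pfd, 1, timeout_ms);

        if (ret <= 0)
                return ret;     /* 0 on timeout, negative on error */

        /* Drain the pending notification byte(s), then query the saved
         * exception state through the debug ioctl (follow-up patches). */
        while (read(dbg_ev_fd, drain, sizeof(drain)) == sizeof(drain))
                ;
        return 1;
}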

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 104 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |   2 +
 4 files changed, 123 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 5e2ee2d1acc4..dccb27fc764b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,107 @@
 #include "kfd_device_queue_manager.h"
 #include 
 
+void debug_event_write_work_handler(struct work_struct *work)
+{
+   struct kfd_process *process;
+
+   static const char write_data = '.';
+   loff_t pos = 0;
+
+   process = container_of(work,
+   struct kfd_process,
+   debug_event_workarea);
+
+   kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
+}
+
+/* update process/device/queue exception status, write to descriptor
+ * only if exception_status is enabled.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+   struct kfd_process *process, struct kfd_node *dev,
+   unsigned int source_id, bool use_worker,
+   void *exception_data, size_t exception_data_size)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   bool is_subscribed = true;
+
+   if (!(process && process->debug_trap_enabled))
+   return false;
+
+   mutex_lock(&process->event_mutex);
+
+   if (event_mask & KFD_EC_MASK_DEVICE) {
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+
+   if (pdd->dev != dev)
+   continue;
+
+   pdd->exception_status |= event_mask & 
KFD_EC_MASK_DEVICE;
+
+   if (event_mask & 
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   if (!pdd->vm_fault_exc_data) {
+   pdd->vm_fault_exc_data = kmemdup(
+   exception_data,
+   exception_data_size,
+   GFP_KERNEL);
+   if (!pdd->vm_fault_exc_data)
+   pr_debug("Failed to allocate 
exception data memory");
+   } else {
+   pr_debug("Debugger exception data not 
saved\n");
+   print_hex_dump_bytes("exception data: ",
+   DUMP_PREFIX_OFFSET,
+   exception_data,
+   exception_data_size);
+   }
+   }
+   break;
+   }
+   } else if (event_mask & KFD_EC_MASK_PROCESS) {
+   process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+   } else {
+   pqm = &process->pqm;
+   list_for_each_entry(pqn, &pqm->queues,
+   process_queue_list) {
+   int target_id;
+
+   if (!pqn->q)
+   continue;
+
+   target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+   pqn->q->properties.queue_id :
+   pqn->q->doorbell_id;
+
+   if (pqn->q->device != dev || target_id != source_id)
+   continue;
+
+   pqn->q->properties.exception_status |= event_mask;
+   break;
+   }
+   }
+
+   if (process->exception_enable_mask & event_mask) {
+   if (use_worker)
+   schedule_work(&process->debug_event_workarea);
+   else
+   kernel_write(process->dbg_ev

[PATCH 10/33] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls

2023-05-25 Thread Jonathan Kim
GFX9.4.2 now supports per-VMID debug mode control registers
(SPI_GDBG_PER_VMID_CNTL).

Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
forward all debug mode setting register writes to the HWS scheduler
using a new MAP_PROCESS API, so instead of writing to registers, return
the required register values that the HWS needs to write on debug enable
and disable.
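
To make the "return the required register values" point concrete, the
standalone sketch below models what these helpers hand back: a packed
per-VMID control value built from TRAP_EN, EXCP_EN and EXCP_REPLACE that the
HWS later writes through MAP_PROCESS.  The bit positions are placeholders,
not the real SPI_GDBG_PER_VMID_CNTL layout.

#include <stdint.h>
#include <stdio.h>

#define TRAP_EN_SHIFT       0   /* placeholder offsets only */
#define EXCP_EN_SHIFT       1
#define EXCP_REPLACE_SHIFT  10

static uint32_t pack_vmid_debug_cntl(uint32_t trap_en, uint32_t excp_en,
                                     uint32_t excp_replace)
{
        return (trap_en << TRAP_EN_SHIFT) |
               (excp_en << EXCP_EN_SHIFT) |
               (excp_replace << EXCP_REPLACE_SHIFT);
}

int main(void)
{
        /* enable: traps on, no exception overrides */
        printf("enable:  0x%08x\n", pack_vmid_debug_cntl(1, 0, 0));
        /* disable: TRAP_EN follows keep_trap_enabled */
        printf("disable: 0x%08x\n", pack_vmid_debug_cntl(0, 0, 0));
        return 0;
}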

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 42 ++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4485bb29bec9..a6f98141c29c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,44 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "gc/gc_9_4_2_offset.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+/*
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -42,5 +80,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
+   .disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
 };
-- 
2.25.1



[PATCH 16/33] drm/amdkfd: add per process hw trap enable and disable functions

2023-05-25 Thread Jonathan Kim
To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the enabled devices.

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting.  Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.

Cooperative launch also has debugging restrictions based on HW/FW bugs.
If such bugs exist, the debugger cannot attach to a process that uses GWS
resources, nor can GWS resources be requested if a process is being
debugged.

Multi-process debug devices can only enable trap temporaries based
on certain runtime scenarios, which will be explained when the
runtime enable functions are implemented in a follow up patch.

v2: spot fix with new kfd_node references
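
The unwind logic reduces to the pattern in the standalone sketch below
(illustration only; device_debug_enable()/device_debug_disable() are
hypothetical stand-ins for the per-device KGD calls): remember how far
enablement got and, on failure, disable only the devices that were already
switched into debug mode.

#include <stdio.h>

#define NUM_DEVICES 4

static int device_debug_enable(int dev)   { return dev == 2 ? -1 : 0; } /* fail on dev 2 */
static void device_debug_disable(int dev) { printf("unwound dev %d\n", dev); }

static int debug_enable_all(void)
{
        int i, r = 0;

        for (i = 0; i < NUM_DEVICES; i++) {
                r = device_debug_enable(i);
                if (r)
                        break;
        }

        if (r) {
                /* Unwind only the devices enabled before the failure,
                 * i.e. indices [0, i), mirroring the unwind_count used
                 * by this patch's deactivate path. */
                int unwind_count = i;

                for (i = 0; i < unwind_count; i++)
                        device_debug_disable(i);
        }
        return r;
}

int main(void)
{
        return debug_enable_all() ? 1 : 0;
}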

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 148 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  29 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  10 ++
 4 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 7082d5d0f0e9..9d0c247f80fe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1488,6 +1488,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
+   if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+   retval = -EBUSY;
+   goto out_unlock;
+   }
+
retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws 
: NULL);
mutex_unlock(&p->mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 898cc1fe3d13..73b07b5f17f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,13 +21,78 @@
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include 
 
+static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+{
+   uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
+   uint32_t flags = pdd->process->dbg_flags;
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   return 0;
+
+   return amdgpu_mes_set_shader_debugger(pdd->dev->adev, 
pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+   pdd->watch_points, flags);
+}
+
+/* kfd_dbg_trap_deactivate:
+ * target: target process
+ * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ * unwind_count:
+ * If unwind == true, how far down the pdd list we need
+ * to unwind
+ * else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, 
int unwind_count)
+{
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   /* If this is an unwind, and we have unwound the required
+* enable calls on the pdd list, we need to stop now
+* otherwise we may mess up another debugger session.
+*/
+   if (unwind && i == unwind_count)
+   break;
+
+   /* GFX off is already disabled by debug activate if not RLC 
restore supported. */
+   if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->spi_dbg_override =
+   pdd->dev->kfd2kgd->disable_debug_trap(
+   pdd->dev->adev,
+   target->runtime_info.ttmp_setup,
+   pdd->dev->vm_info.last_vmid_kfd);
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+   release_debug_trap_vmid(pdd->dev->dqm, 
+   &pdd->qpd))
+   pr_err("Failed to release debug vmid on [%i]\n", 
pdd->dev->id);
+
+   if (!pdd->dev->kfd->shared_resources.enable_mes)
+   debug_refresh_runlist(pdd->dev->dqm);
+   else
+   kfd_dbg_set_mes_debug_mode(pdd);
+   }
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
if (!target->debug_trap_enabled)
return 0;
 
+   /*
+* Defer deactivation to runtime if runtime not e

[PATCH 20/33] drm/amdkfd: add runtime enable operation

2023-05-25 Thread Jonathan Kim
The debugger can attach to a process prior to HSA enablement (i.e.
inferior is spawned by the debugger and attached to immediately before
target process has been enabled for HSA dispatches) or it
can attach to a running target that is already HSA enabled.  Either
way, the debugger needs to know the enablement status to know when
it can inspect queues.

For the scenario where the debugger spawns the target process,
it will have to wait for ROCr's runtime enable request from the target.
The runtime enable request will be able to see that its process has been
debug attached.  ROCr raises an EC_PROCESS_RUNTIME signal to the
debugger then blocks the target process while waiting for the debugger's
response. Once the debugger has received the runtime signal, it will
unblock the target process.

For the scenario where the debugger attaches to a running target
process, ROCr will set the target process' runtime status as enabled so
that on an attach request, the debugger will be able to see this
status and will continue with debug enablement as normal.

A secondary requirement is to conditionally enable the trap temporaries only
if the user requests it (env var HSA_ENABLE_DEBUG=1) or if the debugger
attaches with HSA runtime enabled.  This is because setting up the trap
temporaries incurs a performance overhead that is unacceptable for
microbench performance in normal mode for certain customers.

In the scenario where the debugger spawns the target process, when ROCr
detects that the debugger has attached during the runtime enable
request, it will enable the trap temporaries before it blocks the target
process while waiting for the debugger to respond.

In the scenario where the debugger attaches to a running target process,
it will enable the trap temporaries itself.

Finally, there is an additional restriction that is required to be
enforced with runtime enable and HW debug mode setting. The debugger must
first ensure that HW debug mode has been enabled before permitting HW debug
mode operations.

With single process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that debug HW mode setting can
occur before the KFD has reserved the debug VMID (0xf) from the hardware
scheduler's VMID allocation resource pool.  This can result in the
hardware scheduler assigning VMID 0xf to a non-debugged process and
having that process inherit debug HW mode settings intended for the
debugged target process instead, which is both incorrect and potentially
fatal for normal mode operation.

With multi process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that non-debugged processes
migrating to a new VMID could inherit unintended debug settings.

All debug operations that touch HW settings must require trap activation
where trap activation is triggered by both debug attach and runtime
enablement (target has KFD opened and is ready to dispatch work).

v2: fixup with new kfd_node struct reference
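
The ordering rule in the last paragraphs can be reduced to the standalone
sketch below (illustration only; the struct and enum names are made up):
HW-touching debug operations are rejected until the target is both debug
attached and runtime enabled, i.e. trap activated.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum runtime_state { RUNTIME_DISABLED, RUNTIME_ENABLED };

struct target_proc {
        bool debug_trap_enabled;                /* set on debug attach */
        enum runtime_state runtime_state;       /* set on runtime enable */
};

/* Reject HW debug mode operations until trap activation has happened. */
static int hw_debug_op_allowed(const struct target_proc *p)
{
        if (!p->debug_trap_enabled || p->runtime_state != RUNTIME_ENABLED)
                return -EPERM;
        return 0;
}

int main(void)
{
        struct target_proc p = { .debug_trap_enabled = true,
                                 .runtime_state = RUNTIME_DISABLED };

        printf("allowed: %d\n", hw_debug_op_allowed(&p));      /* -EPERM */
        p.runtime_state = RUNTIME_ENABLED;
        printf("allowed: %d\n", hw_debug_op_allowed(&p));      /* 0 */
        return 0;
}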

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 143 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   1 +
 4 files changed, 150 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ec5a85454192..73cb5abce431 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2738,11 +2738,140 @@ static int kfd_ioctl_criu(struct file *filep, struct 
kfd_process *p, void *data)
return ret;
 }
 
-static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, 
void *data)
+static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
+   bool enable_ttmp_setup)
+{
+   int i = 0, ret = 0;
+
+   if (p->is_runtime_retry)
+   goto retry;
+
+   if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+   return -EBUSY;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+
+   if (pdd->qpd.queue_count)
+   return -EEXIST;
+   }
+
+   p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+   p->runtime_info.r_debug = r_debug;
+   p->runtime_info.ttmp_setup = enable_ttmp_setup;
+
+   if (p->runtime_info.ttmp_setup) {
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+
+   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->dev->kfd2kgd->enable_debug_trap(
+   pdd->dev

[PATCH 13/33] drm/amdkfd: prepare map process for single process debug devices

2023-05-25 Thread Jonathan Kim
Older HW only supports debugging on a single process because the
SPI debug mode setting registers are device global.

The HWS has supplied a single pinned VMID (0xf) for MAP_PROCESS
for debug purposes. To pin the VMID, the KFD will remove the VMID from
the HWS dynamic VMID allocation via SET_RESOURCES so that a debugged
process will never migrate away from its pinned VMID.

The KFD is responsible for reserving and releasing this pinned VMID
accordingly whenever the debugger attaches and detaches respectively.

v2: spot fix ups using new kfd_node references
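
The reserve/release arithmetic amounts to the bit manipulation in the short
standalone sketch below (the bitmap value is illustrative): the pinned debug
VMID is removed from the HWS-allocatable pool on reserve and returned on
release.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t compute_vmid_bitmap = 0xff00;  /* assume VMIDs 8..15 belong to KFD */
        const uint32_t last_vmid_kfd = 0xf;     /* VMID pinned for debug */

        /* reserve: the HWS may no longer hand out VMID 0xf */
        compute_vmid_bitmap &= ~(1u << last_vmid_kfd);
        printf("reserved: 0x%04x\n", compute_vmid_bitmap);     /* 0x7f00 */

        /* release: VMID 0xf goes back to the dynamic pool */
        compute_vmid_bitmap |= (1u << last_vmid_kfd);
        printf("released: 0x%04x\n", compute_vmid_bitmap);     /* 0xff00 */
        return 0;
}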

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 93 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c|  9 ++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |  5 +-
 4 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index d1f44feb7084..c8519adc89ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1524,6 +1524,7 @@ static int initialize_cpsch(struct device_queue_manager 
*dqm)
dqm->gws_queue_count = 0;
dqm->active_runlist = false;
INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+   dqm->trap_debug_vmid = 0;
 
init_sdma_bitmaps(dqm);
 
@@ -2500,6 +2501,98 @@ static void kfd_process_hw_exception(struct work_struct 
*work)
amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
 }
 
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd)
+{
+   int r;
+   int updated_vmid_mask;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   dqm_lock(dqm);
+
+   if (dqm->trap_debug_vmid != 0) {
+   pr_err("Trap debug id already reserved\n");
+   r = -EBUSY;
+   goto out_unlock;
+   }
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+   USE_DEFAULT_GRACE_PERIOD, false);
+   if (r)
+   goto out_unlock;
+
+   updated_vmid_mask = dqm->dev->kfd->shared_resources.compute_vmid_bitmap;
+   updated_vmid_mask &= ~(1 << dqm->dev->vm_info.last_vmid_kfd);
+
+   dqm->dev->kfd->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+   dqm->trap_debug_vmid = dqm->dev->vm_info.last_vmid_kfd;
+   r = set_sched_resources(dqm);
+   if (r)
+   goto out_unlock;
+
+   r = map_queues_cpsch(dqm);
+   if (r)
+   goto out_unlock;
+
+   pr_debug("Reserved VMID for trap debug: %i\n", dqm->trap_debug_vmid);
+
+out_unlock:
+   dqm_unlock(dqm);
+   return r;
+}
+
+/*
+ * Releases vmid for the trap debugger
+ */
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd)
+{
+   int r;
+   int updated_vmid_mask;
+   uint32_t trap_debug_vmid;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   dqm_lock(dqm);
+   trap_debug_vmid = dqm->trap_debug_vmid;
+   if (dqm->trap_debug_vmid == 0) {
+   pr_err("Trap debug id is not reserved\n");
+   r = -EINVAL;
+   goto out_unlock;
+   }
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+   USE_DEFAULT_GRACE_PERIOD, false);
+   if (r)
+   goto out_unlock;
+
+   updated_vmid_mask = dqm->dev->kfd->shared_resources.compute_vmid_bitmap;
+   updated_vmid_mask |= (1 << dqm->dev->vm_info.last_vmid_kfd);
+
+   dqm->dev->kfd->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+   dqm->trap_debug_vmid = 0;
+   r = set_sched_resources(dqm);
+   if (r)
+   goto out_unlock;
+
+   r = map_queues_cpsch(dqm);
+   if (r)
+   goto out_unlock;
+
+   pr_debug("Released VMID for trap debug: %i\n", trap_debug_vmid);
+
+out_unlock:
+   dqm_unlock(dqm);
+   return r;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index d4dd3b4acbf0..bf7aa3f84182 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -250,6 +250,7 @@ struct device_queue_manager {
   

[PATCH 08/33] drm/amdkfd: fix kfd_suspend_all_processes

2023-05-25 Thread Jonathan Kim
Flush delayed restore work in kfd_suspend_all_processes instead of
cancelling. Cancelling the work before it runs results in the queues
becoming permanently disabled. Flushing the work ensures that the
queue suspend/resume state stays balanced.
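
A very rough standalone model of why flushing is the right call is sketched
below (illustration only; the real bookkeeping lives in the queue manager's
eviction counters): a pending restore that gets cancelled never decrements
the eviction count, so the queues never come back.

#include <stdbool.h>
#include <stdio.h>

struct queue_state { int evict_count; };        /* > 0 means queues disabled */

static void evict(struct queue_state *q)   { q->evict_count++; }
static void restore(struct queue_state *q) { q->evict_count--; }

int main(void)
{
        struct queue_state q = { .evict_count = 1 };    /* restore still pending */
        bool restore_pending = true;

        /* flush_delayed_work(): let the pending restore run first. */
        if (restore_pending) {
                restore(&q);
                restore_pending = false;
        }
        /* With cancel_delayed_work_sync() the restore above never runs and
         * evict_count can never return to zero, even after a later resume. */

        evict(&q);      /* suspend-time eviction */
        restore(&q);    /* later resume */

        printf("still disabled after resume: %s\n", q.evict_count ? "yes" : "no");
        return 0;
}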

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index af0a4b5257cc..d63a764dafb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2014,7 +2014,7 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
cancel_delayed_work_sync(&p->eviction_work);
-   cancel_delayed_work_sync(&p->restore_work);
+   flush_delayed_work(&p->restore_work);
 
if (kfd_process_evict_queues(p, 
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
-- 
2.25.1



[PATCH 07/33] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls

2023-05-25 Thread Jonathan Kim
On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance requirements.

There is a hardware bug in GFX9.4.1 where if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler with the barrier logic having no ability to be
aware of this, thereby causing other waves to wait at the barrier
indefinitely resulting in a shader hang.  This bug has been corrected
for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device-global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
will either dispatch work through the compute ring buffers used for
image post processing or through the hardware scheduler by the KFD.

Have the KGD suspend and drain the compute ring buffer, then suspend the
hardware scheduler and block any future KFD process job requests before
changing the implicit wait count setting.  Once set, resume all work.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   3 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 116 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |   4 +-
 3 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5af954abd5ba..7c49cdc37d95 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1065,6 +1065,9 @@ struct amdgpu_device {
struct pci_saved_state  *pci_state;
pci_channel_state_t pci_channel_state;
 
+   /* Track auto wait count on s_barrier settings */
+   boolbarrier_has_auto_waitcnt;
+
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 4191af5a3f13..d2918e5c0dea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
 #include "sdma0/sdma0_4_2_2_offset.h"
 #include "sdma0/sdma0_4_2_2_sh_mask.h"
 #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gfxhub_v1_0.h"
 #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #define HQD_N_REGS 56
 #define DUMP_REG(addr) do {\
@@ -276,6 +279,117 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device 
*adev, void *mqd,
return 0;
 }
 
+/*
+ * Helper used to suspend/resume gfx pipe for image post process work to set
+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool 
suspend)
+{
+   int i, r = 0;
+
+   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+   struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+
+   if (!(ring && ring->sched.thread))
+   continue;
+
+   /* stop scheduler and drain ring. */
+   if (suspend) {
+   drm_sched_stop(&ring->sched, NULL);
+   r = amdgpu_fence_wait_empty(ring);
+   if (r)
+   goto out;
+   } else {
+   drm_sched_start(&ring->sched, false);
+   }
+   }
+
+out:
+   /* return on resume or failure to drain rings. */
+   if (!suspend || r)
+   return r;
+
+   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool 
enable_waitcnt)
+{
+   uint32_t data;
+
+   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return;
+
+   amdgpu_amdkfd_suspend(adev, false);
+
+   if (suspend_resume_compute_scheduler(adev, true))
+   goto out;
+
+   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+   data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+   !enable_waitcnt);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
+
+out:
+   suspend_resume_compute_scheduler(adev, false);
+
+   amdgpu_a

[PATCH 12/33] drm/amdgpu: add configurable grace period for unmap queues

2023-05-25 Thread Jonathan Kim
The HWS scheduler allows a grace period for wave completion prior to
preemption for better performance by avoiding CWSR on waves that can
potentially complete quickly. The debugger, on the other hand, will
want to inspect wave status immediately after it actively triggers
preemption (a suspend function to be provided).

To minimize latency between preemption and debugger wave inspection, allow
immediate preemption by setting the grace period to 0.

Note that setting the preemption grace period to 0 will result in an
infinite grace period being set due to a CP FW bug, so set it to 1 for now.

v2: add null grace period function pointers to VI packet manager.
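
A standalone sketch of that clamp is below; the field mask and shift are
placeholders rather than the real CP_IQ_WAIT_TIME2 layout, but the
0-means-infinite workaround is the point.

#include <stdint.h>
#include <stdio.h>

/* Pack a grace period into a wait-time register image, clamping 0 to 1 so
 * the CP FW bug cannot turn "no grace period" into an infinite one. */
static uint32_t build_grace_period(uint32_t wait_times, uint32_t grace_period)
{
        const uint32_t sch_wave_mask = 0xffu;   /* placeholder field width */
        const uint32_t sch_wave_shift = 0;      /* placeholder field offset */

        if (grace_period == 0)
                grace_period = 1;

        return (wait_times & ~(sch_wave_mask << sch_wave_shift)) |
               ((grace_period & sch_wave_mask) << sch_wave_shift);
}

int main(void)
{
        printf("0x%08x\n", build_grace_period(0x12345600, 0));  /* low byte becomes 0x01 */
        return 0;
}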

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 43 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  6 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  8 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 63 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 39 +++
 .../drm/amd/amdkfd/kfd_packet_manager_vi.c|  2 +
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 ++
 14 files changed, 295 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a6f98141c29c..b811a0985050 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -82,5 +82,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index d2918e5c0dea..a62bd0068515 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
kgd_gfx_v9_set_vm_context_page_table_base,
.enable_debug_trap = kgd_arcturus_enable_debug_trap,
.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 240f5006e278..98006c7021dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -803,6 +803,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
return 0;
 }
 
+/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
+ * The values read are:
+ * ib_offload_wait_time -- Wait Count for Indirect Buffer Offloads.
+ * atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ * wrm_offload_wait_time-- Wait Count for WAIT_REG_MEM Offloads.
+ * gws_wait_time-- Wait Count for Global Wave Syncs.
+ * que_sleep_wait_time  -- Wait Count for Dequeue Retry.
+ * sch_wave_wait_time   -- Wait Count for Scheduling Wave Message.
+ * sem_rearm_wait_time  -- Wait Count for Semaphore re-arm.
+ * deq_retry_wait_time  -- Wait Count for Global Wave Syncs.
+ */
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
+   uint32_t *wait_times)
+
+{
+   *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+   uint32_t wait_times,
+   uint32_t grace_period,
+   uint32_t *reg_offset,
+   uint32_t *reg_data)
+{
+   *reg_data = wait_times;
+
+   /*
+* The CP cannot handle a 0 grace period input and will result in
+* an infinite grace period being set so set

[PATCH 14/33] drm/amdgpu: prepare map process for multi-process debug devices

2023-05-25 Thread Jonathan Kim
Unlike single process debug devices, multi-process debug devices allow
debug mode setting per-VMID (non-device-global).

Because the HWS manages PASID-VMID mapping, the new MAP_PROCESS API allows
the KFD to forward the required SPI debug register write requests.

To request a new debug mode setting change, the KFD must be able to
preempt all queues then remap all queues with these new setting
requests for MAP_PROCESS to take effect.

Note that by default, trap enablement in non-debug mode must be disabled
for performance reasons for multi-process debug devices due to setup
overhead in FW.

v2: spot fixup new kfd_node references

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  5 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 14 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  9 
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  5 ++
 6 files changed, 87 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index a8abfe2a0a14..db6d72e7930f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -29,4 +29,9 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info,
uint32_t *runtime_info_size);
+static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
+{
+   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c8519adc89ac..badfe1210bc4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,6 +36,7 @@
 #include "kfd_kernel_queue.h"
 #include "amdgpu_amdkfd.h"
 #include "mes_api_def.h"
+#include "kfd_debug.h"
 
 /* Size of the per-pipe EOP queue */
 #define CIK_HPD_EOP_BYTES_LOG2 11
@@ -2593,6 +2594,56 @@ int release_debug_trap_vmid(struct device_queue_manager 
*dqm,
return r;
 }
 
+int debug_lock_and_unmap(struct device_queue_manager *dqm)
+{
+   int r;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+   return 0;
+
+   dqm_lock(dqm);
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 0, 
false);
+   if (r)
+   dqm_unlock(dqm);
+
+   return r;
+}
+
+int debug_map_and_unlock(struct device_queue_manager *dqm)
+{
+   int r;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+   return 0;
+
+   r = map_queues_cpsch(dqm);
+
+   dqm_unlock(dqm);
+
+   return r;
+}
+
+int debug_refresh_runlist(struct device_queue_manager *dqm)
+{
+   int r = debug_lock_and_unmap(dqm);
+
+   if (r)
+   return r;
+
+   return debug_map_and_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index bf7aa3f84182..bb75d93712eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -290,6 +290,9 @@ int reserve_debug_trap_vmid(struct device_queue_manager 
*dqm,
struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
+int debug_lock_and_unmap(struct device_queue_manager *dqm);
+int debug_map_and_unlock(struct device_queue_manager *dqm);
+int debug_refresh_runlist(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 0fe73dbd28af..29a2d0499b67 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -88,6 +88,10 @@ static int pm_map_process_aldebaran(struct packet_manager 
*pm,
 {
struct pm4_mes_map_process_aldebaran *packet;
uint64_t vm_page_table_base_addr = qpd->page_table_base;
+   struct kfd_dev *kfd = pm->dqm->dev->kfd;
+   struct kfd_process_device *pdd =
+   

[PATCH 09/33] drm/amdgpu: add gfx10 hw debug mode enable and disable calls

2023-05-25 Thread Jonathan Kim
Similar to GFX9 debug devices, set the hardware debug mode by draining
the SPI appropriately prior to the mode setting request.

Because GFX10 has waves allocated by the work group boundary and each
SE's SPI instances do not communicate, the SPI drain time is much longer.
This long drain time will be fixed for GFX11 onwards.

Also remove a bunch of deprecated misplaced references for GFX10.3.
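
The drain constant in the diff follows directly from the SPI timing noted in
the code comment below: roughly 3500 clock cycles of drain at about 32 clock
cycles per SPI_GDBG_WAVE_CNTL read works out to 3500 / 32, which is about 110
dummy reads, the KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY value used here.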

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  96 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  28 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  | 148 +-
 3 files changed, 127 insertions(+), 145 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 7b60268d93c0..240f5006e278 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -21,6 +21,7 @@
  */
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_amdkfd_gfx_v10.h"
 #include "gc/gc_10_1_0_offset.h"
 #include "gc/gc_10_1_0_sh_mask.h"
 #include "athub/athub_2_0_0_offset.h"
@@ -709,6 +710,99 @@ static void set_vm_context_page_table_base(struct 
amdgpu_device *adev,
adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * GFX10 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~3500 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads 
required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because current GFX10 chips cannot support multi-process debugging due to
+ *   trap configuration and masking being limited to global scope.  Always
+ *   assume single process conditions.
+ *
+ */
+
+#define KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY  110
+static void kgd_gfx_v10_set_wave_launch_stall(struct amdgpu_device *adev, 
uint32_t vmid, bool stall)
+{
+   uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+   int i;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+   stall ? 1 << vmid : 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+   if (!stall)
+   return;
+
+   for (i = 0; i < KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+   RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+
+   mutex_lock(&adev->grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   /* assume gfx off is disabled for the debug session if rlc restore not 
supported. */
+   if (restore_dbg_registers) {
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, 1 << vmid);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   return 0;
+   }
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   return 0;
+}
+
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   mutex_lock(&adev->grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   return 0;
+}
+
 static void program_trap_handler_settings(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_add

[PATCH 05/33] drm/amdgpu: setup hw debug registers on driver initialization

2023-05-25 Thread Jonathan Kim
Add missing debug trap register references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto dispatch during compute context inspection.
In order to correctly set this up, set the special reserved CP bit by
default whenever the MQD is initialized.

v2: add missing 0-init of SPI_GDBG_TRAP_DATA0/1

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 26 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 30 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c   |  3 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h| 14 
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++
 .../include/asic_reg/gc/gc_10_3_0_offset.h| 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 .../include/asic_reg/gc/gc_11_0_0_sh_mask.h   |  4 ++
 12 files changed, 176 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index f7ad883a70fa..be984f8c71c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4825,6 +4825,29 @@ static u32 
gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES   (0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
int i;
@@ -4856,6 +4879,9 @@ static void gfx_v10_0_init_compute_vmid(struct 
amdgpu_device *adev)
WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
}
+
+   gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+   AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index da21bf868080..690e121d9dda 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1638,6 +1638,7 @@ static void gfx_v11_0_init_compute_vmid(struct 
amdgpu_device *adev)
/* Enable trap for each kfd vmid. */
data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL);
data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   WREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL, data);
}
soc21_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(>srbm_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 0189e50bd89f..7f17e0061027 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2303,6 +2303,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 #define

[PATCH 06/33] drm/amdgpu: add gfx9 hw debug mode enable and disable calls

2023-05-25 Thread Jonathan Kim
Implement the per-device calls to enable or disable HW debug mode for
GFX9 prior to GFX9.4.1.

GFX9.4.1 and onward will require their own enable/disable sequence as
follow on patches.

When hardware debug mode setting is requested, waves will inherit
these settings in the Shader Processor Input's (SPI) Sequencer Global
Block (SQG). This means that the KGD must drain all waves from the SPI
into SQG (approximately 96 SPI clock cycles) prior to debug mode setting
to ensure that the order of operations that the debugger expects with
regards to debug mode setting transaction requests and wave inheritance
of that mode is upheld.

Also ensure that exception overrides are reset to their original state
prior to debug enable or disable.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 +++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++
 2 files changed, 101 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 9fa9aab22fe9..64da5995170c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -649,6 +649,96 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device 
*adev,
return 0;
 }
 
+/*
+ * GFX9 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~96 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads 
required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because GFX9.4.1 cannot support multi-process debugging due to trap
+ *   configuration and masking being limited to global scope.  Always assume
+ *   single process conditions.
+ */
+#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY   3
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+   uint32_t vmid,
+   bool stall)
+{
+   int i;
+   uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+   stall ? 1 << vmid : 0);
+   else
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
+   stall ? 1 : 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+   if (!stall)
+   return;
+
+   for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+   RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+/*
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   mutex_lock(&adev->grbm_idx_mutex);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   return 0;
+}
+
+/*
+ * keep_trap_enabled is ignored here but is a general interface requirement
+ * for devices that support multi-process debugging where the performance
+ * overhead from trap temporary setup needs to be bypassed when the debug
+ * session has ended.
+ */
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   mutex_lock(&adev->grbm_idx_mutex);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(&adev->grbm_idx_mutex);
+
+   return 0;
+}
+
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
uint32_t vmid, uint64_t page_table_base)
 {
@@ -875,6 +965,8 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
.get_atc

[PATCH 04/33] drm/amdgpu: add kgd hw debug mode setting interface

2023-05-25 Thread Jonathan Kim
Introduce the required KGD debug calls that will execute hardware debug
mode setting.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 34 +++
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 8cb3dbcae3e4..d0df3381539f 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -291,6 +291,40 @@ struct kfd2kgd_calls {
uint32_t vmid, uint64_t page_table_base);
uint32_t (*read_vmid_from_vmfault_reg)(struct amdgpu_device *adev);
 
+   uint32_t (*enable_debug_trap)(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid);
+   uint32_t (*disable_debug_trap)(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid);
+   int (*validate_trap_override_request)(struct amdgpu_device *adev,
+   uint32_t trap_override,
+   uint32_t *trap_mask_supported);
+   uint32_t (*set_wave_launch_trap_override)(struct amdgpu_device *adev,
+uint32_t vmid,
+uint32_t trap_override,
+uint32_t trap_mask_bits,
+uint32_t trap_mask_request,
+uint32_t *trap_mask_prev,
+uint32_t kfd_dbg_trap_cntl_prev);
+   uint32_t (*set_wave_launch_mode)(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid);
+   uint32_t (*set_address_watch)(struct amdgpu_device *adev,
+   uint64_t watch_address,
+   uint32_t watch_address_mask,
+   uint32_t watch_id,
+   uint32_t watch_mode,
+   uint32_t debug_vmid);
+   uint32_t (*clear_address_watch)(struct amdgpu_device *adev,
+   uint32_t watch_id);
+   void (*get_iq_wait_times)(struct amdgpu_device *adev,
+   uint32_t *wait_times);
+   void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
+   uint32_t wait_times,
+   uint32_t grace_period,
+   uint32_t *reg_offset,
+   uint32_t *reg_data);
void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
-- 
2.25.1



[PATCH 03/33] drm/amdkfd: prepare per-process debug enable and disable

2023-05-25 Thread Jonathan Kim
The ROCm debugger will attach to a process to debug by PTRACE and will
expect the KFD to prepare a process for the target PID, whether the
target PID has opened the KFD device or not.

This patch explicitly handles this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
The debugger as well as the target process for this case will not have
acquired any VMs, so handle process restoration to correctly account for
this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception event will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.

Finally on process termination of either the debugger or the target,
debugging must be disabled if this has not already been done.
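
As an illustration only (not part of this patch), the attach flow described
above would look roughly like the sketch below from the debugger's side:
ptrace-attach to the target, create the pollable notification pipe, then
issue KFD_IOC_DBG_TRAP_ENABLE for the target pid.  The enable argument
fields are intentionally left as a comment; their exact names come from the
uapi header added by the first patch of this series, and the ioctl name
AMDKFD_IOC_DBG_TRAP is an assumption taken from that header.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <linux/kfd_ioctl.h>

static int attach_debugger(pid_t target_pid)
{
        struct kfd_ioctl_dbg_trap_args args;
        int pipefd[2];
        int kfd_fd;

        if (ptrace(PTRACE_ATTACH, target_pid, NULL, NULL))
                return -1;

        if (pipe(pipefd))       /* read end: polled by the debugger,  */
                return -1;      /* write end: handed over to the KFD  */

        kfd_fd = open("/dev/kfd", O_RDWR);
        if (kfd_fd < 0)
                return -1;

        memset(&args, 0, sizeof(args));
        args.pid = target_pid;
        args.op = KFD_IOC_DBG_TRAP_ENABLE;
        /* ... fill the enable union member with pipefd[1] and the subscribed
         * exception mask, per the uapi header ... */

        return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
}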

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 102 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  80 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  32 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  26 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  31 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  60 +++
 7 files changed, 304 insertions(+), 30 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index e758c2a24cd0..747754428073 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -55,7 +55,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_debug.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f4b50b74818e..7082d5d0f0e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file 
*filep)
return -EPERM;
}
 
-   process = kfd_create_process(filep);
+   process = kfd_create_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
 
+   if (kfd_process_init_cwsr_apu(process, filep)) {
+   kfd_unref_process(process);
+   return -EFAULT;
+   }
+
/* filep now owns the reference returned by kfd_create_process */
filep->private_data = process;
 
@@ -2737,6 +2743,10 @@ static int kfd_ioctl_runtime_enable(struct file *filep, 
struct kfd_process *p, v
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, 
void *data)
 {
struct kfd_ioctl_dbg_trap_args *args = data;
+   struct task_struct *thread = NULL;
+   struct mm_struct *mm = NULL;
+   struct pid *pid = NULL;
+   struct kfd_process *target = NULL;
int r = 0;
 
if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2744,9 +2754,81 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
return -EINVAL;
}
 
+   pid = find_get_pid(args->pid);
+   if (!pid) {
+   pr_debug("Cannot find pid info for %i\n", args->pid);
+   r = -ESRCH;
+   goto out;
+   }
+
+   thread = get_pid_task(pid, PIDTYPE_PID);
+   if (!thread) {
+   r = -ESRCH;
+   goto out;
+   }
+
+   mm = get_task_mm(thread);
+   if (!mm) {
+   r = -ESRCH;
+   goto out;
+   }
+
+   if (args->op == KFD_IOC_DBG_TRAP_ENABLE) {
+   bool create_process;
+
+   rcu_read_lock();
+   create_process = thread && thread != current && 
ptrace_parent(thread) == current;
+   rcu_read_unlock();
+
+   target = create_process ? kfd_create_process(t

[PATCH 01/33] drm/amdkfd: add debug and runtime enable interface

2023-05-25 Thread Jonathan Kim
Introduce the GPU debug operations interface.

For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
instruction set, provide the necessary interface to allow the debugger
to set HW debug mode and query exceptions per HSA queue, process or
device.

The runtime_enable interface coordinates exception handling with the
HSA runtime.

Usage is documented in the kernel docs at uapi/linux/kfd_ioctl.h.

v2: add num_xcc to device snapshot entry.
fixup missing EC_QUEUE_PACKET_RESERVED mask.
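
A minimal user-space sketch of driving the new interface, assuming the
pid/op members of struct kfd_ioctl_dbg_trap_args added by this patch; at
this point in the series every debug op returns -EACCES ("not supported
yet"), so the call can only be used to detect that the interface exists:

  #include <errno.h>
  #include <sys/ioctl.h>
  #include <linux/kfd_ioctl.h>

  /* Illustrative probe only: kfd_fd is an already-open /dev/kfd fd. */
  static int kfd_debug_interface_present(int kfd_fd, int target_pid)
  {
          struct kfd_ioctl_dbg_trap_args args = {0};

          args.pid = target_pid;
          args.op = KFD_IOC_DBG_TRAP_ENABLE;

          if (!ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
                  return 1;

          return errno == EACCES;
  }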

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  48 ++
 include/uapi/linux/kfd_ioctl.h   | 668 ++-
 2 files changed, 715 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 88fe1f31739d..f4b50b74818e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2729,6 +2729,48 @@ static int kfd_ioctl_criu(struct file *filep, struct 
kfd_process *p, void *data)
return ret;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, 
void *data)
+{
+   return 0;
+}
+
+static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, 
void *data)
+{
+   struct kfd_ioctl_dbg_trap_args *args = data;
+   int r = 0;
+
+   if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Debugging does not support sched_policy %i", 
sched_policy);
+   return -EINVAL;
+   }
+
+   switch (args->op) {
+   case KFD_IOC_DBG_TRAP_ENABLE:
+   case KFD_IOC_DBG_TRAP_DISABLE:
+   case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+   case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+   case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+   case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+   case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+   case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+   case KFD_IOC_DBG_TRAP_SET_FLAGS:
+   case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+   case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+   case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+   case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
+   pr_warn("Debugging not supported yet\n");
+   r = -EACCES;
+   break;
+   default:
+   pr_err("Invalid option: %i\n", args->op);
+   r = -EINVAL;
+   }
+
+   return r;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -2841,6 +2883,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_EXPORT_DMABUF,
kfd_ioctl_export_dmabuf, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
+   kfd_ioctl_runtime_enable, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
+   kfd_ioctl_set_debug_trap, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2a9671e1ddb5..dfe745ee427e 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -110,6 +110,32 @@ struct kfd_ioctl_get_available_memory_args {
__u32 pad;
 };
 
+struct kfd_dbg_device_info_entry {
+   __u64 exception_status;
+   __u64 lds_base;
+   __u64 lds_limit;
+   __u64 scratch_base;
+   __u64 scratch_limit;
+   __u64 gpuvm_base;
+   __u64 gpuvm_limit;
+   __u32 gpu_id;
+   __u32 location_id;
+   __u32 vendor_id;
+   __u32 device_id;
+   __u32 revision_id;
+   __u32 subsystem_vendor_id;
+   __u32 subsystem_device_id;
+   __u32 fw_version;
+   __u32 gfx_target_version;
+   __u32 simd_count;
+   __u32 max_waves_per_simd;
+   __u32 array_count;
+   __u32 simd_arrays_per_engine;
+   __u32 num_xcc;
+   __u32 capability;
+   __u32 debug_prop;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -775,6 +801,640 @@ struct kfd_ioctl_set_xnack_mode_args {
__s32 xnack_enabled;
 };
 
+/* Wave launch override modes */
+enum kfd_dbg_trap_override_mode {
+   KFD_DBG_TRAP_OVERRIDE_OR = 0,
+   KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
+};
+
+/* Wave launch overrides */
+enum kfd_dbg_trap_mask {
+   KFD_DBG_TRAP_MASK_FP_INVALID = 1,
+   KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2,
+   KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4,
+   KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8,
+   KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16,
+   KFD_DBG_TRAP_MASK_FP_INEXACT = 32,
+   KFD_D

[PATCH 02/33] drm/amdkfd: display debug capabilities

2023-05-25 Thread Jonathan Kim
Expose debug capabilities in the KFD topology node's HSA capabilities and
debug properties flags.

Ensure correct capabilities are exposed based on firmware support.

Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.

v2: rebase topology fw check fix with kfd_node struct update
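
A user-space consumer can check the new bits by parsing the node's
properties file in sysfs.  A minimal sketch, assuming the usual KFD
topology path and node 1 as an example (a real tool would walk all
nodes):

  #include <stdio.h>
  #include <string.h>
  #include <linux/kfd_sysfs.h>

  /* Illustrative: scan "name value" pairs from the properties file and
   * test the debug-trap support bit exposed by this patch.
   */
  static int node_has_trap_debug_support(const char *props_path)
  {
          unsigned long long value, capability = 0;
          char name[64];
          FILE *f = fopen(props_path, "r");

          if (!f)
                  return 0;

          while (fscanf(f, "%63s %llu", name, &value) == 2)
                  if (!strcmp(name, "capability"))
                          capability = value;

          fclose(f);
          return !!(capability & HSA_CAP_TRAP_DEBUG_SUPPORT);
  }

Called with something like
"/sys/class/kfd/kfd/topology/nodes/1/properties".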

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 101 --
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   6 ++
 include/uapi/linux/kfd_sysfs.h|  15 
 3 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 8302d8967158..3def25b2bdbb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -535,6 +535,8 @@ static ssize_t node_show(struct kobject *kobj, struct 
attribute *attr,
  dev->gpu->kfd->mec_fw_version);
sysfs_show_32bit_prop(buffer, offs, "capability",
  dev->node_props.capability);
+   sysfs_show_64bit_prop(buffer, offs, "debug_prop",
+ dev->node_props.debug_prop);
sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
  dev->gpu->kfd->sdma_fw_version);
sysfs_show_64bit_prop(buffer, offs, "unique_id",
@@ -1857,6 +1859,97 @@ static int kfd_topology_add_device_locked(struct 
kfd_node *gpu, uint32_t gpu_id,
return res;
 }
 
+static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device 
*dev)
+{
+   bool firmware_supported = true;
+
+   if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
+   KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
+   firmware_supported =
+   (dev->gpu->adev->mes.sched_version & 
AMDGPU_MES_VERSION_MASK) >= 9;
+   goto out;
+   }
+
+   /*
+* Note: Any unlisted devices here are assumed to support exception 
handling.
+* Add additional checks here as needed.
+*/
+   switch (KFD_GC_VERSION(dev->gpu)) {
+   case IP_VERSION(9, 0, 1):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 459 + 
32768;
+   break;
+   case IP_VERSION(9, 1, 0):
+   case IP_VERSION(9, 2, 1):
+   case IP_VERSION(9, 2, 2):
+   case IP_VERSION(9, 3, 0):
+   case IP_VERSION(9, 4, 0):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 459;
+   break;
+   case IP_VERSION(9, 4, 1):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 60;
+   break;
+   case IP_VERSION(9, 4, 2):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 51;
+   break;
+   case IP_VERSION(10, 1, 10):
+   case IP_VERSION(10, 1, 2):
+   case IP_VERSION(10, 1, 1):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 144;
+   break;
+   case IP_VERSION(10, 3, 0):
+   case IP_VERSION(10, 3, 2):
+   case IP_VERSION(10, 3, 1):
+   case IP_VERSION(10, 3, 4):
+   case IP_VERSION(10, 3, 5):
+   firmware_supported = dev->gpu->kfd->mec_fw_version >= 89;
+   break;
+   case IP_VERSION(10, 1, 3):
+   case IP_VERSION(10, 3, 3):
+   firmware_supported = false;
+   break;
+   default:
+   break;
+   }
+
+out:
+   if (firmware_supported)
+   dev->node_props.capability |= 
HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
+}
+
+static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
+{
+   dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+   HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+   HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+
+   dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
+   HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
+   HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
+
+   if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
+   dev->node_props.debug_prop |= 
HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
+   HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
+
+   if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
+   dev->node_props.debug_prop |=
+   HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+   else
+   dev->node_props.capability |=
+   
HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
+   } else {
+

[PATCH 34/34] drm/amdkfd: optimize gfx off enable toggle for debugging

2023-03-27 Thread Jonathan Kim
Legacy debug devices limited to pinning a single debug VMID for debugging
are the only devices that require disabling GFX OFF while accessing
debug registers.  Debug devices that support multi-process debugging
rely on the hardware scheduler to update debug registers and do not run
into GFX OFF access issues.

Remove KFD GFX OFF enable toggle clutter by moving these calls into the
KGD debug calls themselves.

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  7 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 33 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 24 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 30 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 21 +---
 5 files changed, 73 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 6df215aba4c4..dec4a3381ccb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -350,6 +350,8 @@ static uint32_t kgd_arcturus_enable_debug_trap(struct 
amdgpu_device *adev,
bool restore_dbg_registers,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
+
mutex_lock(>grbm_idx_mutex);
 
kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
@@ -362,6 +364,8 @@ static uint32_t kgd_arcturus_enable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(>grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -375,6 +379,7 @@ static uint32_t kgd_arcturus_disable_debug_trap(struct 
amdgpu_device *adev,
bool keep_trap_enabled,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
 
mutex_lock(>grbm_idx_mutex);
 
@@ -388,6 +393,8 @@ static uint32_t kgd_arcturus_disable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(>grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 const struct kfd2kgd_calls arcturus_kfd2kgd = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 444f9ea758d6..2132376e2107 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -753,12 +753,13 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct 
amdgpu_device *adev,
bool restore_dbg_registers,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
 
mutex_lock(>grbm_idx_mutex);
 
kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
 
-   /* assume gfx off is disabled for the debug session if rlc restore not 
supported. */
+   /* keep gfx off disabled for the debug session if rlc restore not 
supported. */
if (restore_dbg_registers) {
uint32_t data = 0;
 
@@ -783,6 +784,8 @@ uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device 
*adev,
 
mutex_unlock(>grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -790,6 +793,8 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
bool keep_trap_enabled,
uint32_t vmid)
 {
+   amdgpu_gfx_off_ctrl(adev, false);
+
mutex_lock(>grbm_idx_mutex);
 
kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
@@ -800,6 +805,16 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
 
mutex_unlock(>grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
+   /*
+* Remove the extra gfx off disable reference from debug restore call
+* for asics that do not support rlc restore for debug registers.
+*/
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 10) ||
+   adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 1))
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -831,6 +846,8 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
 {
uint32_t data, wave_cntl_prev;
 
+   amdgpu_gfx_off_ctrl(adev, false);
+
mutex_lock(>grbm_idx_mutex);
 
wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
@@ -852,6 +869,8 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
 
mutex_unlock(>grbm_idx_mutex);
 
+   amdgpu_gfx_off_ctrl(adev, true);
+
return 0;
 }
 
@@ -862,6 +881,8 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct 
amdgpu_device *adev,
uint32_t data = 0;
bool is_mode_set = !!wave_launch_mode;
 
+   amdgpu_gfx_off_ctrl(adev, false);

[PATCH 33/34] drm/amdkfd: bump kfd ioctl minor version for debug api availability

2023-03-27 Thread Jonathan Kim
Bump the minor version to declare debugging capability is now
available.

v2: bump to 1.13 after upstream rebase.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 -
 include/uapi/linux/kfd_ioctl.h   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3254274bcee0..ead5afe4216b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2970,7 +2970,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
if (!r)
target->exception_enable_mask = 
args->enable.exception_mask;
 
-   pr_warn("Debug functions limited\n");
break;
case KFD_IOC_DBG_TRAP_DISABLE:
r = kfd_dbg_trap_disable(target);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 0e05c0f3f348..7321fc8d1622 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -38,9 +38,10 @@
  * - 1.10 - Add SMI profiler event log
  * - 1.11 - Add unified memory for ctx save/restore area
  * - 1.12 - Add DMA buf export ioctl
+ * - 1.13 - Add debugger API
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 12
+#define KFD_IOCTL_MINOR_VERSION 13
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.25.1



[PATCH 27/34] drm/amdkfd: add debug set and clear address watch points operation

2023-03-27 Thread Jonathan Kim
Shader read, write and atomic memory operations can be alerted to the
debugger as an address watch exception.

Allow the debugger to pass in a watch point to a particular memory
address per device.

Note that only 4 watch points exist per device to date, so have
the KFD keep track of which watch points are allocated or not.

v4: fixup bad indentation

v3: add gfx11 support.
cleanup gfx9 kgd calls to set and clear address watch.
use per device spinlock to set watch points.
fixup runlist refresh calls on set/clear address watch.

v2: change dev_id arg to gpu_id for consistency
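
The bookkeeping itself is small; a stand-alone sketch of a 4-slot
allocator (the kernel side uses MAX_WATCH_ADDRESSES and a per-device
spinlock, the helpers below are hypothetical):

  #define MAX_WATCH_POINTS 4

  /* Each set bit in alloc_mask marks a watch point id currently in use. */
  static unsigned int alloc_mask;

  static int watch_point_alloc(void)
  {
          int id;

          for (id = 0; id < MAX_WATCH_POINTS; id++) {
                  if (!(alloc_mask & (1u << id))) {
                          alloc_mask |= 1u << id;
                          return id;      /* watch_id handed to the KGD call */
                  }
          }

          return -1;                      /* all watch points in use */
  }

  static void watch_point_free(int id)
  {
          alloc_mask &= ~(1u << id);
  }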

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  51 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  78 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|   8 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  52 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  77 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  24 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 136 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   6 +-
 13 files changed, 451 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 774ecfc3451a..efd6a72aab4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -118,6 +118,55 @@ static uint32_t kgd_aldebaran_set_wave_launch_mode(struct 
amdgpu_device *adev,
return data;
 }
 
+#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
+static uint32_t kgd_gfx_aldebaran_set_address_watch(
+   struct amdgpu_device *adev,
+   uint64_t watch_address,
+   uint32_t watch_address_mask,
+   uint32_t watch_id,
+   uint32_t watch_mode,
+   uint32_t debug_vmid)
+{
+   uint32_t watch_address_high;
+   uint32_t watch_address_low;
+   uint32_t watch_address_cntl;
+
+   watch_address_cntl = 0;
+   watch_address_low = lower_32_bits(watch_address);
+   watch_address_high = upper_32_bits(watch_address) & 0x;
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MODE,
+   watch_mode);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MASK,
+   watch_address_mask >> 6);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   VALID,
+   1);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_high);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_low);
+
+   return watch_address_cntl;
+}
+
+uint32_t kgd_gfx_aldebaran_clear_address_watch(struct amdgpu_device *adev,
+   uint32_t watch_id)
+{
+   return 0;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -141,6 +190,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
+   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
+   .clear_address_watch = kgd_gfx_aldebaran_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index fbdc1b7b1e42..6df215aba4c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -413,6 +413,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.validate_trap_overri

[PATCH 21/34] drm/amdkfd: add debug trap enabled flag to tma

2023-03-27 Thread Jonathan Kim
From: Jay Cornwall 

Trap handler behavior will differ when a debugger is attached.

Make the debug trap flag available in the trap handler TMA.
Update it when the debug trap ioctl is invoked.

v4: fix up comments to clarify flagging implementation.

v3: Rebase for upstream

v2:
Add missing debug flag setup on APUs

Signed-off-by: Jay Cornwall 
Reviewed-by: Felix Kuehling 
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 15 +++
 3 files changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index f498987dc21d..c779acb9a623 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -256,6 +256,8 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, 
bool unwind, int unwind
if (unwind && i == unwind_count)
break;
 
+   kfd_process_set_trap_debug_flag(>qpd, false);
+
/* GFX off is already disabled by debug activate if not RLC 
restore supported. */
if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
@@ -351,6 +353,15 @@ int kfd_dbg_trap_activate(struct kfd_process *target)
if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
 
+   /**
+* Setting the debug flag in the trap handler requires that the 
TMA has been
+* allocated, which occurs during CWSR initialization.
+* In the event that CWSR has not been initialized at this 
point, setting the
+* flag will be called again during CWSR initialization if the 
target process
+* is still debug enabled.
+*/
+   kfd_process_set_trap_debug_flag(>qpd, true);
+
if (!pdd->dev->shared_resources.enable_mes)
r = debug_refresh_runlist(pdd->dev->dqm);
else
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d092c81c9dc2..42a4502287f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1105,6 +1105,8 @@ int kfd_init_apertures(struct kfd_process *process);
 void kfd_process_set_trap_handler(struct qcm_process_device *qpd,
  uint64_t tba_addr,
  uint64_t tma_addr);
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+bool enabled);
 
 /* CWSR initialization */
 int kfd_process_init_cwsr_apu(struct kfd_process *process, struct file *filep);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c6a4d01bb1b5..d26aa339fa6b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1303,6 +1303,8 @@ int kfd_process_init_cwsr_apu(struct kfd_process *p, 
struct file *filep)
 
memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+   kfd_process_set_trap_debug_flag(qpd, p->debug_trap_enabled);
+
qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for 
pqm.\n",
qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1339,6 +1341,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct 
kfd_process_device *pdd)
 
memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
 
+   kfd_process_set_trap_debug_flag(>qpd,
+   pdd->process->debug_trap_enabled);
+
qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
 qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
@@ -1425,6 +1430,16 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool 
supported)
return true;
 }
 
+void kfd_process_set_trap_debug_flag(struct qcm_process_device *qpd,
+bool enabled)
+{
+   if (qpd->cwsr_kaddr) {
+   uint64_t *tma =
+   (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+   tma[2] = enabled;
+   }
+}
+
 /*
  * On return the kfd_process is fully operational and will be freed when the
  * mm is released
-- 
2.25.1



[PATCH 25/34] drm/amdkfd: add debug wave launch mode operation

2023-03-27 Thread Jonathan Kim
Allow the debugger to set wave launch behaviour to either operate
normally, halt at launch, trap on every instruction, terminate
immediately or stall on allocation.

v3: remove unneeded stall_launch reference in an earlier patch.
gfx off optimization will be addressed later.

v2: add gfx11 support and remove deprecated launch mode options

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 12 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 25 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  3 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 14 +++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 25 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 36 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  2 ++
 11 files changed, 124 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index d7881bbd828d..774ecfc3451a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -107,6 +107,17 @@ static uint32_t 
kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
return data;
 }
 
+static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, 
wave_launch_mode);
+
+   return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -129,6 +140,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
.validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
+   .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index ec2587664001..fbdc1b7b1e42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -412,6 +412,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.disable_debug_trap = kgd_arcturus_disable_debug_trap,
.validate_trap_override_request = 
kgd_gfx_v9_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_gfx_v9_set_wave_launch_trap_override,
+   .set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 8e2468d02001..5ace6aa042c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -855,6 +855,30 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
return 0;
 }
 
+uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+   bool is_mode_set = !!wave_launch_mode;
+
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+   VMID_MASK, is_mode_set ? 1 << vmid : 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
+   MODE, is_mode_set ? wave_launch_mode : 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+}
+
 /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
  * The values read are:
  * ib_offload_wait_time -- Wait Count for Indirect Buffer Offloads.
@@ -942,6 +966,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.disabl

[PATCH 26/34] drm/amdkfd: add debug suspend and resume process queues operation

2023-03-27 Thread Jonathan Kim
In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A newly created queue cannot be suspended because queue ids are
recycled after destruction, so the debugger needs to know that this has
occurred.  Query functions added later will clear a given queue of its
new-queue status.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblock when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).

v3: update safer copy context save header

v2: add gfx11/mes support.
prevent header copy on suspend from overwriting user fields.
simplify resume_queues function.
address other nit-picks
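
A sketch of how a debugger might interpret the returned per-queue status
words; the two mask names below are placeholders for the upper status
bits described above, not necessarily the uapi names:

  #include <stdio.h>
  #include <stdint.h>

  /* Placeholder bit positions for the "invalid request" and "error"
   * indications; the real values come from the uapi header.
   */
  #define QUEUE_STATUS_INVALID (1u << 30)
  #define QUEUE_STATUS_ERROR   (1u << 31)

  static void report_suspend_result(int suspended, const uint32_t *status,
                                    unsigned int num_queues)
  {
          unsigned int i;

          printf("%d of %u queues suspended\n", suspended, num_queues);

          for (i = 0; i < num_queues; i++) {
                  if (status[i] & QUEUE_STATUS_INVALID)
                          printf("entry %u: invalid (new, destroyed or missing queue)\n", i);
                  else if (status[i] & QUEUE_STATUS_ERROR)
                          printf("entry %u: error (HWS unable to suspend/resume)\n", i);
          }
  }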

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  11 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 446 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  15 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +-
 10 files changed, 510 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8ab5bbb53723..b184612ff91e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -758,6 +758,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev)
return adev->have_atomics_support;
 }
 
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
+{
+   amdgpu_device_flush_hdp(adev, NULL);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 
bool reset)
 {
amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 83cf1943ee34..c1ef008b39d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -312,6 +312,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  uint64_t *mmap_offset);
 int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
  struct dma_buf **dmabuf);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3b4dffbbc563..e62d050a31e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
pr_debug("Write ptr address   == 0x%016llX\n",
args->write_pointer_address);
 
+   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, 
NULL, 0);
return 0;
 
 err_create_queue:
@@ -2982,7 +2983,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->launch_mode.launch_mode);
break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   r = suspend_queues(target,
+   args->suspend_queues.num_queues,
+   args->suspend_queues.grace_period,
+   args->suspend_queues.exception_mask,
+   (uint32_t 
*)args->suspend_queues.queue_array_ptr);
+
+   break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   r = resume_queues(target, args->resume_queues.num_queues,
+   (uint32_t 
*)args->resume_queues.queue_array_ptr);
+   break;
case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_C

[PATCH 28/34] drm/amdkfd: add debug set flags operation

2023-03-27 Thread Jonathan Kim
Allow the debugger to set the single memory operation and single ALU
operation debug flags.

Some exceptions are imprecise (memory violations, address watch) in the
sense that a trap occurs only when the exception interrupt occurs and
not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
the program counter address, which means that these values will not point
to the faulty instruction address but to wherever the wave was when
the interrupt was raised.

Setting the Single Memory Operations flag will inject an automatic wait
on every memory operation instruction forcing imprecise memory exceptions
to become precise at the cost of performance.  This setting is not
permitted on debug devices that support only a global setting of this
option.

Return the previous set flags to the debugger as well.

v3: fix var declare spacing and add rewind to failed flag setting.

v3: make precise mem op the only available flag for now.

v2: add gfx11 support.
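
A sketch of the debugger-side call, assuming the set_flags.flags member
used in the dispatch below and an already-open /dev/kfd fd; on success
the same field is overwritten with the previously set flags so the
caller can restore them later:

  #include <sys/ioctl.h>
  #include <linux/kfd_ioctl.h>

  /* Illustrative: request precise (single) memory operations for the
   * attached target and return the previous flags through *prev_flags.
   */
  static int enable_precise_mem_ops(int kfd_fd, int target_pid,
                                    unsigned int *prev_flags)
  {
          struct kfd_ioctl_dbg_trap_args args = {0};
          int r;

          args.pid = target_pid;
          args.op = KFD_IOC_DBG_TRAP_SET_FLAGS;
          args.set_flags.flags = KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;

          r = ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
          if (!r)
                  *prev_flags = args.set_flags.flags;

          return r;
  }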

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 58 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
 3 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 194221ab0f25..da3478b133bd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3021,6 +3021,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->clear_node_address_watch.id);
break;
case KFD_IOC_DBG_TRAP_SET_FLAGS:
+   r = kfd_dbg_trap_set_flags(target, >set_flags.flags);
+   break;
case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 4b8b71b1a322..5d3193ae71e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -23,6 +23,7 @@
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
 #include 
+#include 
 
 #define MAX_WATCH_ADDRESSES4
 
@@ -423,6 +424,59 @@ static void kfd_dbg_clear_process_address_watch(struct 
kfd_process *target)
kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], 
j);
 }
 
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
+{
+   uint32_t prev_flags = target->dbg_flags;
+   int i, r = 0, rewind_count = 0;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
+   (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
+   *flags = prev_flags;
+   return -EACCES;
+   }
+   }
+
+   target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
+   *flags = prev_flags;
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   continue;
+
+   if (!pdd->dev->shared_resources.enable_mes)
+   r = debug_refresh_runlist(pdd->dev->dqm);
+   else
+   r = kfd_dbg_set_mes_debug_mode(pdd);
+
+   if (r) {
+   target->dbg_flags = prev_flags;
+   break;
+   }
+
+   rewind_count++;
+   }
+
+   /* Rewind flags */
+   if (r) {
+   target->dbg_flags = prev_flags;
+
+   for (i = 0; i < rewind_count; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   continue;
+
+   if (!pdd->dev->shared_resources.enable_mes)
+   debug_refresh_runlist(pdd->dev->dqm);
+   else
+   kfd_dbg_set_mes_debug_mode(pdd);
+   }
+   }
+
+   return r;
+}
 
 /* kfd_dbg_trap_deactivate:
  * target: target process
@@ -437,9 +491,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, 
bool unwind, int unwind
int i;
 
if (!unwind) {
+   uint32_t flags = 0;
+
cancel_work_sync(>debug_event_workarea);
kfd_dbg_clear_process_address_watch(target);
kfd_dbg_trap_set_wave_launch_mode(target, 0);
+
+   kfd_dbg_trap_set_flags(target, );
}
 
for (i = 0; i < target->n_pdds; i++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 63c716ce5ab

[PATCH 20/34] drm/amdkfd: add runtime enable operation

2023-03-27 Thread Jonathan Kim
The debugger can attach to a process prior to HSA enablement (i.e.
inferior is spawned by the debugger and attached to immediately before
the target process has been enabled for HSA dispatches) or it
can attach to a running target that is already HSA enabled.  Either
way, the debugger needs to know the enablement status to know when
it can inspect queues.

For the scenario where the debugger spawns the target process,
it will have to wait for ROCr's runtime enable request from the target.
The runtime enable request will be able to see that its process has been
debug attached.  ROCr raises an EC_PROCESS_RUNTIME signal to the
debugger then blocks the target process while waiting the debugger's
response. Once the debugger has received the runtime signal, it will
unblock the target process.

For the scenario where the debugger attaches to a running target
process, ROCr will set the target process' runtime status as enabled so
that on an attach request, the debugger will be able to see this
status and will continue with debug enablement as normal.

A secondary requirement is to conditionally enable the trap temporaries only
if the user requests it (env var HSA_ENABLE_DEBUG=1) or if the debugger
attaches with HSA runtime enabled.  This is because setting up the trap
temporaries incurs a performance overhead that is unacceptable for
microbench performance in normal mode for certain customers.

In the scenario where the debugger spawns the target process, when ROCr
detects that the debugger has attached during the runtime enable
request, it will enable the trap temporaries before it blocks the target
process while waiting for the debugger to respond.

In the scenario where the debugger attaches to a running target process,
it will enable the trap temporaries itself.

Finally, there is an additional restriction that must be enforced with
runtime enable and HW debug mode setting. The debugger must
first ensure that HW debug mode has been enabled before permitting HW debug
mode operations.

With single process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that debug HW mode setting can
occur before the KFD has reserved the debug VMID (0xf) from the hardware
scheduler's VMID allocation resource pool.  This can result in the
hardware scheduler assigning VMID 0xf to a non-debugged process and
having that process inherit debug HW mode settings intended for the
debugged target process instead, which is both incorrect and potentially
fatal for normal mode operation.

With multi process debug devices, allowing the debugger to set debug
HW modes prior to trap activation means that non-debugged processes
migrating to a new VMID could inherit unintended debug settings.

All debug operations that touch HW settings must require trap activation
where trap activation is triggered by both debug attach and runtime
enablement (target has KFD opened and is ready to dispatch work).

v3: fix and cleanup ttmp_setup for runtime_enable.

v2: fix up hierarchy of semantics in description.
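
A sketch of the debugger's side of the handshake: after it receives the
EC_PROCESS_RUNTIME notification it sends the runtime event back so the
KFD releases the blocked runtime-enable call.  The send_runtime_event
members follow the op added earlier in the series; error handling is
omitted:

  #include <sys/ioctl.h>
  #include <linux/kfd_ioctl.h>

  /* Illustrative: acknowledge EC_PROCESS_RUNTIME so the target's blocked
   * runtime enable ioctl is allowed to return.
   */
  static int ack_runtime_enable(int kfd_fd, int target_pid)
  {
          struct kfd_ioctl_dbg_trap_args args = {0};

          args.pid = target_pid;
          args.op = KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT;
          /* gpu_id/queue_id are unused when only EC_PROCESS_RUNTIME is set. */
          args.send_runtime_event.exception_mask = KFD_EC_MASK(EC_PROCESS_RUNTIME);

          return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
  }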

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 143 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   |   6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   1 +
 4 files changed, 150 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c48d4598583a..2a8d30dbb3f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2724,11 +2724,140 @@ static int kfd_ioctl_criu(struct file *filep, struct 
kfd_process *p, void *data)
return ret;
 }
 
-static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, 
void *data)
+static int runtime_enable(struct kfd_process *p, uint64_t r_debug,
+   bool enable_ttmp_setup)
+{
+   int i = 0, ret = 0;
+
+   if (p->is_runtime_retry)
+   goto retry;
+
+   if (p->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+   return -EBUSY;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+
+   if (pdd->qpd.queue_count)
+   return -EEXIST;
+   }
+
+   p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+   p->runtime_info.r_debug = r_debug;
+   p->runtime_info.ttmp_setup = enable_ttmp_setup;
+
+   if (p->runtime_info.ttmp_setup) {
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+
+   if (!kfd_dbg_is_rlc_restore_supported(pdd->dev)) {
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->dev->kfd2kgd->enable_debug_trap(
+

[PATCH 19/34] drm/amdkfd: add send exception operation

2023-03-27 Thread Jonathan Kim
Add a debug operation that allows the debugger to send an exception
directly to runtime through a payload address.

For memory violations, normal vm fault signals will be applied to
notify the runtime instead, after passing in the exception data that
was saved when the memory violation was raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function which will be explained and implemented in a follow up
patch.

v2: missing closing brace in set workaround function got fixed
in patch 17.

Signed-off-by: Jonathan Kim 
---
 .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 43 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  6 ++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 71 ++-
 8 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 5c8023cba196..62a38cd820fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
return;
 
if (info.vmid == vmid)
-   kfd_signal_vm_fault_event(dev, pasid, );
+   kfd_signal_vm_fault_event(dev, pasid, , NULL);
else
-   kfd_signal_vm_fault_event(dev, pasid, NULL);
+   kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index acbed2a25a1e..c48d4598583a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2819,6 +2819,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
r = kfd_dbg_trap_disable(target);
break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+   r = kfd_dbg_send_exception_to_runtime(target,
+   args->send_runtime_event.gpu_id,
+   args->send_runtime_event.queue_id,
+   args->send_runtime_event.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 23ca8123f8f5..b3bd101b3b34 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+   unsigned int dev_id,
+   unsigned int queue_id,
+   uint64_t error_reason)
+{
+   if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   struct kfd_process_device *pdd = NULL;
+   struct kfd_hsa_memory_exception_data *data;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   if (p->pdds[i]->dev->id == dev_id) {
+   pdd = p->pdds[i];
+   break;
+   }
+   }
+
+   if (!pdd)
+   return -ENODEV;
+
+   data = (struct kfd_hsa_memory_exception_data *)
+   pdd->vm_fault_exc_data;
+
+   kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+   kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+   error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+   }
+
+   if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+   /*
+* block should only happen after the debugger receives runtime
+* enable notice.
+*/
+   up(>runtime_enable_sema);
+   error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+   }
+
+   if (error_reason)
+   return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+   return 0;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
struct mqd_update_info minfo = {0};
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index a6cdbd9f1b54..adde17c45b4f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++

[PATCH 32/34] drm/amdkfd: add debug device snapshot operation

2023-03-27 Thread Jonathan Kim
Similar to queue snapshot, return an array of device information using
an entry_size check and return.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.

v3: was reviewed but re-requesting review with new revision and
subvendor information.
memset 0 device info entry to clear padding.

v2: change buf_size are to num_devices for more clarity.
expand device entry new members on copy.
fix minimum entry size calculation for queue and device snapshot.
change device snapshot implementation to match queue snapshot
implementation.
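
A sketch of the resulting two-call pattern from the debugger's side:
query with num_devices == 0 to learn the count, allocate, then fetch.
The member names follow the dispatch below; allocation and error
handling are only illustrative:

  #include <stdint.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kfd_ioctl.h>

  static struct kfd_dbg_device_info_entry *
  get_device_snapshot(int kfd_fd, int target_pid, unsigned int *count)
  {
          struct kfd_ioctl_dbg_trap_args args = {0};
          struct kfd_dbg_device_info_entry *buf;

          args.pid = target_pid;
          args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
          args.device_snapshot.entry_size = sizeof(*buf);

          /* First call: num_devices == 0, KFD writes back the real count. */
          if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
                  return NULL;

          buf = calloc(args.device_snapshot.num_devices, sizeof(*buf));
          if (!buf)
                  return NULL;

          /* Second call: num_devices already holds the count written back. */
          args.device_snapshot.snapshot_buf_ptr = (uintptr_t)buf;
          args.device_snapshot.entry_size = sizeof(*buf);

          if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
                  free(buf);
                  return NULL;
          }

          *count = args.device_snapshot.num_devices;
          return buf;
  }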

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 7868a0aecc9f..3254274bcee0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3046,8 +3046,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
>queue_snapshot.entry_size);
break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-   pr_warn("Debug op %i not supported yet\n", args->op);
-   r = -EACCES;
+   r = kfd_dbg_trap_device_snapshot(target,
+   args->device_snapshot.exception_mask,
+   (void __user 
*)args->device_snapshot.snapshot_buf_ptr,
+   >device_snapshot.num_devices,
+   >device_snapshot.entry_size);
break;
default:
pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 905182669559..0ea85afcffd3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include 
 #include 
 
@@ -1010,6 +1011,77 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+   uint64_t exception_clear_mask,
+   void __user *user_info,
+   uint32_t *number_of_device_infos,
+   uint32_t *entry_size)
+{
+   struct kfd_dbg_device_info_entry device_info;
+   uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+   int i, r = 0;
+
+   if (!(target && user_info && number_of_device_infos && entry_size))
+   return -EINVAL;
+
+   tmp_num_devices = min_t(size_t, *number_of_device_infos, 
target->n_pdds);
+   *number_of_device_infos = target->n_pdds;
+   *entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+   if (!tmp_num_devices)
+   return 0;
+
+   memset(_info, 0, sizeof(device_info));
+
+   mutex_lock(>event_mutex);
+
+   /* Run over all pdd of the process */
+   for (i = 0; i < tmp_num_devices; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct kfd_topology_device *topo_dev = 
kfd_topology_device_by_id(pdd->dev->id);
+
+   device_info.gpu_id = pdd->dev->id;
+   device_info.exception_status = pdd->exception_status;
+   device_info.lds_base = pdd->lds_base;
+   device_info.lds_limit = pdd->lds_limit;
+   device_info.scratch_base = pdd->scratch_base;
+   device_info.scratch_limit = pdd->scratch_limit;
+   device_info.gpuvm_base = pdd->gpuvm_base;
+   device_info.gpuvm_limit = pdd->gpuvm_limit;
+   device_info.location_id = topo_dev->node_props.location_id;
+   device_info.vendor_id = topo_dev->node_props.vendor_id;
+   device_info.device_id = topo_dev->node_props.device_id;
+   device_info.revision_id = pdd->dev->adev->pdev->revision;
+   device_info.subsystem_vendor_id = 
pdd->dev->adev->pdev->subsystem_vendor;
+   device_info.subsystem_device_id = 
pdd->dev->adev->pdev->subsystem_device;
+   device_info.fw_version = pdd->dev->mec_fw_version;
+   device_info.gfx_target_version =
+   topo_dev->node_props.gfx_target_version;
+   device_info.simd_count = topo_dev->node_props.

[PATCH 16/34] drm/amdkfd: add per process hw trap enable and disable functions

2023-03-27 Thread Jonathan Kim
To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the enabled devices.

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting.  Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.

Cooperative launch also has a debugging restriction based on HW/FW bugs.
If such bugs exist, the debugger cannot attach to a process that uses GWS
resources nor can GWS resources be requested if a process is being
debugged.

Multi-process debug devices can only enable trap temporaries based
on certain runtime scenarios, which will be explained when the
runtime enable functions are implemented in a follow up patch.

v3: make gfx off restore unconditional on detach and get rid of unneeded
count variable for unwind.
also move gfx11 workaround to later so we're not defining unused
functions and better illustrate the use of the workaround.

v2: add gfx11 support. fix fw checks. remove asic family name comments.
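
The enable-or-rewind flow described above follows a common
all-or-nothing pattern; a stand-alone sketch with hypothetical
enable_one()/disable_one() helpers standing in for the per-device KGD
calls:

  /* Stand-ins for the per-device enable/disable KGD calls. */
  static int enable_one(int dev)    { (void)dev; return 0; }
  static void disable_one(int dev)  { (void)dev; }

  /* If any device fails to enter debug mode, unwind the devices that
   * were already enabled so no partial debug state is left behind.
   */
  static int enable_all_devices(int num_devices)
  {
          int i, j, r;

          for (i = 0; i < num_devices; i++) {
                  r = enable_one(i);
                  if (r) {
                          for (j = 0; j < i; j++)
                                  disable_one(j);
                          return r;
                  }
          }

          return 0;
  }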

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 148 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  29 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |  10 ++
 4 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 58c08d8a191f..c0c54206b1c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1473,6 +1473,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
+   if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+   retval = -EBUSY;
+   goto out_unlock;
+   }
+
retval = pqm_set_gws(>pqm, args->queue_id, args->num_gws ? dev->gws 
: NULL);
mutex_unlock(>mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 2bf2aa21eccc..fe5b965ac8f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,13 +21,78 @@
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include 
 
+static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+{
+   uint32_t spi_dbg_cntl = pdd->spi_dbg_override | 
pdd->spi_dbg_launch_mode;
+   uint32_t flags = pdd->process->dbg_flags;
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   return 0;
+
+   return amdgpu_mes_set_shader_debugger(pdd->dev->adev, 
pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+   pdd->watch_points, flags);
+}
+
+/* kfd_dbg_trap_deactivate:
+ * target: target process
+ * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ * unwind_count:
+ * If unwind == true, how far down the pdd list we need
+ * to unwind
+ * else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, 
int unwind_count)
+{
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   /* If this is an unwind, and we have unwound the required
+* enable calls on the pdd list, we need to stop now
+* otherwise we may mess up another debugger session.
+*/
+   if (unwind && i == unwind_count)
+   break;
+
+   /* GFX off is already disabled by debug activate if not RLC 
restore supported. */
+   if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->spi_dbg_override =
+   pdd->dev->kfd2kgd->disable_debug_trap(
+   pdd->dev->adev,
+   target->runtime_info.ttmp_setup,
+   pdd->dev->vm_info.last_vmid_kfd);
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+   release_debug_trap_vmid(pdd->dev->dqm, 
>qpd))
+   pr_err("Failed to release debug vmid on [%i]\n", 
pdd->dev->id);
+
+   if (!pdd->dev->shared_resources.enable_mes)
+   debug_refresh_runlist(pdd->dev->dqm);
+   else
+ 

[PATCH 13/34] drm/amdkfd: prepare map process for single process debug devices

2023-03-27 Thread Jonathan Kim
Older HW only supports debugging on a single process because the
SPI debug mode setting registers are device global.

The HWS has supplied a single pinned VMID (0xf) for MAP_PROCESS
for debug purposes. To pin the VMID, the KFD will remove the VMID from
the HWS dynamic VMID allocation via SET_RESOURCES so that a debugged
process will never migrate away from its pinned VMID.

The KFD is responsible for reserving and releasing this pinned VMID
accordingly whenever the debugger attaches and detaches respectively.

v2: remove unneeded formatting and add back mistakenly removed worker.

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 93 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c|  9 ++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |  5 +-
 4 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 807cad60d21e..40744dc3db57 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1492,6 +1492,7 @@ static int initialize_cpsch(struct device_queue_manager 
*dqm)
dqm->gws_queue_count = 0;
dqm->active_runlist = false;
INIT_WORK(>hw_exception_work, kfd_process_hw_exception);
+   dqm->trap_debug_vmid = 0;
 
init_sdma_bitmaps(dqm);
 
@@ -2465,6 +2466,98 @@ static void kfd_process_hw_exception(struct work_struct 
*work)
amdgpu_amdkfd_gpu_reset(dqm->dev->adev);
 }
 
+int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd)
+{
+   int r;
+   int updated_vmid_mask;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   dqm_lock(dqm);
+
+   if (dqm->trap_debug_vmid != 0) {
+   pr_err("Trap debug id already reserved\n");
+   r = -EBUSY;
+   goto out_unlock;
+   }
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+   USE_DEFAULT_GRACE_PERIOD, false);
+   if (r)
+   goto out_unlock;
+
+   updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+   updated_vmid_mask &= ~(1 << dqm->dev->vm_info.last_vmid_kfd);
+
+   dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+   dqm->trap_debug_vmid = dqm->dev->vm_info.last_vmid_kfd;
+   r = set_sched_resources(dqm);
+   if (r)
+   goto out_unlock;
+
+   r = map_queues_cpsch(dqm);
+   if (r)
+   goto out_unlock;
+
+   pr_debug("Reserved VMID for trap debug: %i\n", dqm->trap_debug_vmid);
+
+out_unlock:
+   dqm_unlock(dqm);
+   return r;
+}
+
+/*
+ * Releases vmid for the trap debugger
+ */
+int release_debug_trap_vmid(struct device_queue_manager *dqm,
+   struct qcm_process_device *qpd)
+{
+   int r;
+   int updated_vmid_mask;
+   uint32_t trap_debug_vmid;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   dqm_lock(dqm);
+   trap_debug_vmid = dqm->trap_debug_vmid;
+   if (dqm->trap_debug_vmid == 0) {
+   pr_err("Trap debug id is not reserved\n");
+   r = -EINVAL;
+   goto out_unlock;
+   }
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+   USE_DEFAULT_GRACE_PERIOD, false);
+   if (r)
+   goto out_unlock;
+
+   updated_vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
+   updated_vmid_mask |= (1 << dqm->dev->vm_info.last_vmid_kfd);
+
+   dqm->dev->shared_resources.compute_vmid_bitmap = updated_vmid_mask;
+   dqm->trap_debug_vmid = 0;
+   r = set_sched_resources(dqm);
+   if (r)
+   goto out_unlock;
+
+   r = map_queues_cpsch(dqm);
+   if (r)
+   goto out_unlock;
+
+   pr_debug("Released VMID for trap debug: %i\n", trap_debug_vmid);
+
+out_unlock:
+   dqm_unlock(dqm);
+   return r;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index fb48b124161f..0cb1504d24cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -250,6 +250,7 @@ struct device_queue_manager {
struct kfd_

[PATCH 07/34] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls

2023-03-27 Thread Jonathan Kim
On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance requirements.

There is a hardware bug in GFX9.4.1 where if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler with the barrier logic having no ability to be
aware of this, thereby causing other waves to wait at the barrier
indefinitely resulting in a shader hang.  This bug has been corrected
for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device-global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
will either dispatch work through the compute ring buffers used for
image post processing or through the hardware scheduler by the KFD.

Have the KGD suspend and drain the compute ring buffer, then suspend the
hardware scheduler and block any future KFD process job requests before
changing the implicit wait count setting.  Once set, resume all work.
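
Illustrative sketch of that sequence (not the patch itself; the SQ_CONFIG
field name and error handling are assumptions, and the
barrier_has_auto_waitcnt bookkeeping is omitted):

static void toggle_barrier_waitcnt_sketch(struct amdgpu_device *adev,
					  bool enable_waitcnt)
{
	uint32_t data;

	amdgpu_amdkfd_suspend(adev, false);		/* block new KFD job requests */
	if (suspend_resume_compute_scheduler(adev, true))	/* drain compute rings */
		goto resume;

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
	/* Assumed field name: waitcnt is implicit when the disable bit is clear. */
	data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
			     enable_waitcnt ? 0 : 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);

resume:
	suspend_resume_compute_scheduler(adev, false);
	amdgpu_amdkfd_resume(adev, false);
}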

v3: simplify boolean checks and fix comment format kern bot complaints

v2: remove flush on kfd suspend as that will be a general fix required
outside of this patch series.
comment on trap enable/disable ignored variables.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   3 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 116 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |   4 +-
 3 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bbac4239ceb3..7dbc9b0d4b9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1040,6 +1040,9 @@ struct amdgpu_device {
struct pci_saved_state  *pci_state;
pci_channel_state_t pci_channel_state;
 
+   /* Track auto wait count on s_barrier settings */
+   boolbarrier_has_auto_waitcnt;
+
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 4191af5a3f13..d2918e5c0dea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
 #include "sdma0/sdma0_4_2_2_offset.h"
 #include "sdma0/sdma0_4_2_2_sh_mask.h"
 #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gfxhub_v1_0.h"
 #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
 
 #define HQD_N_REGS 56
 #define DUMP_REG(addr) do {\
@@ -276,6 +279,117 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device 
*adev, void *mqd,
return 0;
 }
 
+/*
+ * Helper used to suspend/resume gfx pipe for image post process work to set
+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool 
suspend)
+{
+   int i, r = 0;
+
+   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+   struct amdgpu_ring *ring = >gfx.compute_ring[i];
+
+   if (!(ring && ring->sched.thread))
+   continue;
+
+   /* stop scheduler and drain ring. */
+   if (suspend) {
+   drm_sched_stop(>sched, NULL);
+   r = amdgpu_fence_wait_empty(ring);
+   if (r)
+   goto out;
+   } else {
+   drm_sched_start(>sched, false);
+   }
+   }
+
+out:
+   /* return on resume or failure to drain rings. */
+   if (!suspend || r)
+   return r;
+
+   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool 
enable_waitcnt)
+{
+   uint32_t data;
+
+   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+   if (!down_read_trylock(>reset_domain->sem))
+   return;
+
+   amdgpu_amdkfd_suspend(adev, false);
+
+   if (suspend_resume_compute_scheduler(adev, true))
+   goto out;
+
+   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+   data = REG_SET_FIELD(data, SQ_CONFIG, DI

[PATCH 30/34] drm/amdkfd: add debug query exception info operation

2023-03-27 Thread Jonathan Kim
Allow the debugger to query additional info based on an exception code.
For device exceptions, it's currently only memory violation information.
For process exceptions, it's currently only runtime information.
Queue exceptions only report the queue exception status.

The debugger has the option of clearing the target exception on query.
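
For reference, a debugger-side call could look like the hedged userspace
sketch below. The ioctl request macro and the args layout are assumed to
match the debug uAPI introduced earlier in this series.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Query (and clear) memory violation info raised on a given GPU. */
static int query_mem_violation_info(int kfd_fd, uint32_t pid, uint32_t gpu_id,
				    void *buf, uint32_t *size)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = pid;					/* target process */
	args.op = KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO;
	args.query_exception_info.source_id = gpu_id;
	args.query_exception_info.exception_code = EC_DEVICE_MEMORY_VIOLATION;
	args.query_exception_info.clear_exception = 1;
	args.query_exception_info.info_ptr = (uint64_t)(uintptr_t)buf;
	args.query_exception_info.info_size = *size;	/* in: capacity */

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		return -1;

	*size = args.query_exception_info.info_size;	/* out: bytes reported */
	return 0;
}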

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
 3 files changed, 133 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 593ede6b2af5..1f7e57abefb4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3031,6 +3031,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
>query_debug_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+   r = kfd_dbg_trap_query_exception_info(target,
+   args->query_exception_info.source_id,
+   args->query_exception_info.exception_code,
+   args->query_exception_info.clear_exception,
+   (void __user 
*)args->query_exception_info.info_ptr,
+   >query_exception_info.info_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index e7f1521ac593..905182669559 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -890,6 +890,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+   uint32_t source_id,
+   uint32_t exception_code,
+   bool clear_exception,
+   void __user *info,
+   uint32_t *info_size)
+{
+   bool found = false;
+   int r = 0;
+   uint32_t copy_size, actual_info_size = 0;
+   uint64_t *exception_status_ptr = NULL;
+
+   if (!target)
+   return -EINVAL;
+
+   if (!info || !info_size)
+   return -EINVAL;
+
+   mutex_lock(>event_mutex);
+
+   if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
+   /* Per queue exceptions */
+   struct queue *queue = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct qcm_process_device *qpd = >qpd;
+
+   list_for_each_entry(queue, >queues_list, list) {
+   if (!found && queue->properties.queue_id == 
source_id) {
+   found = true;
+   break;
+   }
+   }
+   if (found)
+   break;
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(queue->properties.exception_status & 
KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+   exception_status_ptr = >properties.exception_status;
+   } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
+   /* Per device exceptions */
+   struct kfd_process_device *pdd = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   pdd = target->pdds[i];
+   if (pdd->dev->id == source_id) {
+   found = true;
+   break;
+   }
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+
+   if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
+   copy_size = min((size_t)(*info_size), 
pdd->vm_fault_exc_data_size);
+
+   if (copy_to_user(info, pdd->vm_fault_exc_data, 
copy_size)) {
+   r = -EFAULT;
+   goto out

[PATCH 31/34] drm/amdkfd: add debug queue snapshot operation

2023-03-27 Thread Jonathan Kim
Allow the debugger to get a snapshot of a specified number of queues
containing various queue property information that is copied to the
debugger.

Since the debugger doesn't know how many queues exist at any given time,
allow the debugger to pass the requested number of snapshots as 0 to get
the actual number of potential snapshots to use for a subsequent snapshot
request for actual information.

To prevent future ABI breakage, pass in the requested entry_size.
The KFD will return its own entry_size in case the debugger still wants to
log the information in a core dump on sizing failure.

Also allow the debugger to clear exceptions when doing a snapshot.
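
The intended size-then-fetch usage from the debugger side is roughly the
hedged sketch below (the ioctl macro, args layout and kfd_queue_snapshot_entry
struct are assumed from the uAPI in this series):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static struct kfd_queue_snapshot_entry *snapshot_queues(int kfd_fd, uint32_t pid,
							 uint32_t *count)
{
	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_queue_snapshot_entry *buf;

	args.pid = pid;
	args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
	args.queue_snapshot.entry_size = sizeof(*buf);
	args.queue_snapshot.num_queues = 0;		/* 1st call: just ask how many */
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		return NULL;

	*count = args.queue_snapshot.num_queues;	/* KFD reports the real count */
	buf = calloc(*count, sizeof(*buf));
	if (!buf)
		return NULL;

	args.queue_snapshot.num_queues = *count;	/* 2nd call: fetch entries */
	args.queue_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)buf;
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
		free(buf);
		return NULL;
	}

	return buf;
}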

v4: declare and memset 0 snapshot struct out of loop

v3: fix uninitialized return and change queue snapshot to type void for
proper increment on buffer copy.
use memset 0 to init snapshot entry to clear struct padding.

v2: change buf_size arg to num_queues for clarity.
fix minimum entry size calculation.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  6 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 36 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +++
 .../amd/amdkfd/kfd_process_queue_manager.c| 40 +++
 5 files changed, 90 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1f7e57abefb4..7868a0aecc9f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3039,6 +3039,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
>query_exception_info.info_size);
break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+   r = pqm_get_queue_snapshot(>pqm,
+   args->queue_snapshot.exception_mask,
+   (void __user 
*)args->queue_snapshot.snapshot_buf_ptr,
+   >queue_snapshot.num_queues,
+   >queue_snapshot.entry_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
r = -EACCES;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f79c26e02417..ce0a1483606e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3004,6 +3004,42 @@ int suspend_queues(struct kfd_process *p,
return total_suspended;
 }
 
+static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
+{
+   switch (q_props->type) {
+   case KFD_QUEUE_TYPE_COMPUTE:
+   return q_props->format == KFD_QUEUE_FORMAT_PM4
+   ? KFD_IOC_QUEUE_TYPE_COMPUTE
+   : KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
+   case KFD_QUEUE_TYPE_SDMA:
+   return KFD_IOC_QUEUE_TYPE_SDMA;
+   case KFD_QUEUE_TYPE_SDMA_XGMI:
+   return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
+   default:
+   WARN_ONCE(true, "queue type not recognized!");
+   return 0x;
+   };
+}
+
+void set_queue_snapshot_entry(struct queue *q,
+ uint64_t exception_clear_mask,
+ struct kfd_queue_snapshot_entry *qss_entry)
+{
+   qss_entry->ring_base_address = q->properties.queue_address;
+   qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
+   qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
+   qss_entry->ctx_save_restore_address =
+   q->properties.ctx_save_restore_area_address;
+   qss_entry->ctx_save_restore_area_size =
+   q->properties.ctx_save_restore_area_size;
+   qss_entry->exception_status = q->properties.exception_status;
+   qss_entry->queue_id = q->properties.queue_id;
+   qss_entry->gpu_id = q->device->id;
+   qss_entry->ring_size = (uint32_t)q->properties.queue_size;
+   qss_entry->queue_type = set_queue_type_for_user(>properties);
+   q->properties.exception_status &= ~exception_clear_mask;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 7ccf8d0d1867..89d4a5b293a5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -296,6 +296,9 @@ int suspend_queues(struct kfd_process *p,
 int resume_qu

[PATCH 17/34] drm/amdkfd: apply trap workaround for gfx11

2023-03-27 Thread Jonathan Kim
Due to a HW bug, waves in only half the shader arrays can enter trap.

When starting a debug session, relocate all waves to the first shader
array of each shader engine and mask off the 2nd shader array as
unavailable.

When ending a debug session, re-enable the 2nd shader array per
shader engine.

User CU masking per queue cannot be guaranteed to remain functional
if requested during debugging (e.g. user cu mask requests only 2nd shader
array as an available resource leading to zero HW resources available)
nor can the runtime be alerted of any of these changes during execution.

Make user CU masking and debugging mutually exclusive with respect to
availability.

If the debugger tries to attach to a process with a user cu masked
queue, return the runtime status as enabled but busy.

If the debugger tries to attach and fails to reallocate queue waves to
the first shader array of each shader engine, return the runtime status
as enabled but with an error.

In addition, like any other multi-process debug supported device,
disable trap temporary setup per-process to avoid performance impact from
setup overhead.
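
The mutual exclusion amounts to two checks along the lines of the sketch
below (the helper name is made up for illustration; the per-queue CU-mask
state is whatever the patch records when a user mask is applied):

/* Illustrative only. */
static bool debug_and_cu_mask_conflict(struct kfd_process *p,
				       bool attaching_debugger,
				       bool applying_cu_mask)
{
	/* Attach sees a user CU-masked queue: report runtime enabled but busy. */
	if (attaching_debugger && process_has_user_cu_masked_queue(p))
		return true;

	/* CU mask request while a debug session is active: refuse it. */
	if (applying_cu_mask && p->debug_trap_enabled)
		return true;

	return false;
}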

v2: move this after debug trap enable disable patch to better illustrate
application.
disable debugging for now on gfx11 due to broken fw.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c|  7 +--
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  2 -
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 57 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  3 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  7 +++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 42 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
 .../amd/amdkfd/kfd_process_queue_manager.c|  9 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  7 ++-
 14 files changed, 122 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index d20df0cf0d88..b5f5eed2b5ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -219,6 +219,8 @@ struct mes_add_queue_input {
uint32_tgws_size;
uint64_ttba_addr;
uint64_ttma_addr;
+   uint32_ttrap_en;
+   uint32_tskip_process_ctx_clear;
uint32_tis_kfd_process;
uint32_tis_aql_queue;
uint32_tqueue_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 08793dac618c..4fff158e231f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
mes_add_queue_pkt.gws_size = input->gws_size;
mes_add_queue_pkt.trap_handler_addr = input->tba_addr;
mes_add_queue_pkt.tma_addr = input->tma_addr;
+   mes_add_queue_pkt.trap_en = input->trap_en;
+   mes_add_queue_pkt.skip_process_ctx_clear = 
input->skip_process_ctx_clear;
mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
 
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL 
queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
 
-   if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 4) &&
- (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) &&
- (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3
-   mes_add_queue_pkt.trap_en = 1;
-
/* For KFD, gds_size is re-used for queue size (needed in MES for AQL 
queues) */
mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
mes_add_queue_pkt.gds_size = input->queue_size;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c0c54206b1c2..acbed2a25a1e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct 
kfd_process *p,
goto out;
}
 
-   minfo.update_flag = UPDATE_FLAG_CU_MASK;
-
mutex_lock(>mutex);
 
retval = pqm_update_mqd(>pqm, args->queue_id, );
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index fe5b965ac8f6..7e6d8694938d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,57 @@
 #include "kfd_device_queue_manager.h"
 #include 
 
+

[PATCH 14/34] drm/amdgpu: prepare map process for multi-process debug devices

2023-03-27 Thread Jonathan Kim
Unlike single process debug devices, multi-process debug devices allow
debug mode setting per-VMID (non-device-global).

Because the HWS manages PASID-VMID mapping, the new MAP_PROCESS API allows
the KFD to forward the required SPI debug register write requests.

To request a new debug mode setting change, the KFD must be able to
preempt all queues then remap all queues with these new setting
requests for MAP_PROCESS to take effect.

Note that by default, trap enablement in non-debug mode must be disabled
for performance reasons for multi-process debug devices due to setup
overhead in FW.

v3: remove unneeded comment.  also add missing kfd_debug.h include
in dqm file.

v2: remove asic family code name comment in per vmid support check

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  5 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 14 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  9 
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  5 ++
 6 files changed, 87 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index a33825f9caf7..20b8a3e97d8e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -29,4 +29,9 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
void __user *runtime_info,
uint32_t *runtime_info_size);
+static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev)
+{
+   return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
+}
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 40744dc3db57..a39bd8f3d4bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -36,6 +36,7 @@
 #include "kfd_kernel_queue.h"
 #include "amdgpu_amdkfd.h"
 #include "mes_api_def.h"
+#include "kfd_debug.h"
 
 /* Size of the per-pipe EOP queue */
 #define CIK_HPD_EOP_BYTES_LOG2 11
@@ -2558,6 +2559,56 @@ int release_debug_trap_vmid(struct device_queue_manager 
*dqm,
return r;
 }
 
+int debug_lock_and_unmap(struct device_queue_manager *dqm)
+{
+   int r;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+   return 0;
+
+   dqm_lock(dqm);
+
+   r = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, 0, 
false);
+   if (r)
+   dqm_unlock(dqm);
+
+   return r;
+}
+
+int debug_map_and_unlock(struct device_queue_manager *dqm)
+{
+   int r;
+
+   if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+   return -EINVAL;
+   }
+
+   if (!kfd_dbg_is_per_vmid_supported(dqm->dev))
+   return 0;
+
+   r = map_queues_cpsch(dqm);
+
+   dqm_unlock(dqm);
+
+   return r;
+}
+
+int debug_refresh_runlist(struct device_queue_manager *dqm)
+{
+   int r = debug_lock_and_unmap(dqm);
+
+   if (r)
+   return r;
+
+   return debug_map_and_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 0cb1504d24cf..bef3be84c5cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -286,6 +286,9 @@ int reserve_debug_trap_vmid(struct device_queue_manager 
*dqm,
struct qcm_process_device *qpd);
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
+int debug_lock_and_unmap(struct device_queue_manager *dqm);
+int debug_map_and_unlock(struct device_queue_manager *dqm);
+int debug_refresh_runlist(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 363cf8e005cc..81c190337e34 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -88,6 +88,10 @@ static int pm_map_process_aldebaran(struct packet_manager 
*pm,
 {
struct pm4_mes_map_process_aldebaran *packet;
uint64_t vm_page_table_base_addr = qpd->page_table_base;
+

[PATCH 15/34] drm/amdgpu: expose debug api for mes

2023-03-27 Thread Jonathan Kim
Similar to the F32 HWS, the RS64 HWS for GFX11 now supports a multi-process
debug API.

The skip_process_ctx_clear ADD_QUEUE requirement is to prevent the MES
from clearing the process context when the first queue is added to the
scheduler in order to maintain debug mode settings during queue preemption
and restore.  The MES clears the process context in this case due to an
unresolved FW caching bug during normal mode operations.
During debug mode, the KFD will hold a reference to the target process
so the process context should never go stale and MES can afford to skip
this requirement.
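
A KFD-side caller packs the per-VMID controls into the new flags union, for
example (hedged sketch; the locals and wrapper name are illustrative, the
amdgpu_mes_set_shader_debugger() signature is from this patch):

static int set_debugger_sketch(struct amdgpu_device *adev,
			       uint64_t process_ctx_gpu_addr,
			       uint32_t spi_dbg_cntl,
			       const uint32_t watch_cntl[4])
{
	union {
		struct {
			uint32_t single_memop : 1;	/* precise memory ops */
			uint32_t single_alu_op : 1;	/* precise ALU ops */
			uint32_t reserved : 30;
		};
		uint32_t u32all;
	} flags = {0};

	flags.single_memop = 1;

	return amdgpu_mes_set_shader_debugger(adev, process_ctx_gpu_addr,
					      spi_dbg_cntl, watch_cntl,
					      flags.u32all);
}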

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   | 32 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h   | 20 
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c| 12 +++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h | 21 +++-
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 0e55823ef6ca..505f453038dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -924,6 +924,38 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, 
uint32_t reg,
return r;
 }
 
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+   uint64_t process_context_addr,
+   uint32_t spi_gdbg_per_vmid_cntl,
+   const uint32_t *tcp_watch_cntl,
+   uint32_t flags)
+{
+   struct mes_misc_op_input op_input = {0};
+   int r;
+
+   if (!adev->mes.funcs->misc_op) {
+   DRM_ERROR("mes set shader debugger is not supported!\n");
+   return -EINVAL;
+   }
+
+   op_input.op = MES_MISC_OP_SET_SHADER_DEBUGGER;
+   op_input.set_shader_debugger.process_context_addr = 
process_context_addr;
+   op_input.set_shader_debugger.flags.u32all = flags;
+   op_input.set_shader_debugger.spi_gdbg_per_vmid_cntl = 
spi_gdbg_per_vmid_cntl;
+   memcpy(op_input.set_shader_debugger.tcp_watch_cntl, tcp_watch_cntl,
+   sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
+
+   amdgpu_mes_lock(>mes);
+
+   r = adev->mes.funcs->misc_op(>mes, _input);
+   if (r)
+   DRM_ERROR("failed to set_shader_debugger\n");
+
+   amdgpu_mes_unlock(>mes);
+
+   return r;
+}
+
 static void
 amdgpu_mes_ring_to_queue_props(struct amdgpu_device *adev,
   struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 547ec35691fa..d20df0cf0d88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -256,6 +256,7 @@ enum mes_misc_opcode {
MES_MISC_OP_READ_REG,
MES_MISC_OP_WRM_REG_WAIT,
MES_MISC_OP_WRM_REG_WR_WAIT,
+   MES_MISC_OP_SET_SHADER_DEBUGGER,
 };
 
 struct mes_misc_op_input {
@@ -278,6 +279,20 @@ struct mes_misc_op_input {
uint32_t   reg0;
uint32_t   reg1;
} wrm_reg;
+
+   struct {
+   uint64_t process_context_addr;
+   union {
+   struct {
+   uint64_t single_memop : 1;
+   uint64_t single_alu_op : 1;
+   uint64_t reserved: 30;
+   };
+   uint32_t u32all;
+   } flags;
+   uint32_t spi_gdbg_per_vmid_cntl;
+   uint32_t tcp_watch_cntl[4];
+   } set_shader_debugger;
};
 };
 
@@ -340,6 +355,11 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, 
uint32_t reg,
 int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device *adev,
  uint32_t reg0, uint32_t reg1,
  uint32_t ref, uint32_t mask);
+int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
+   uint64_t process_context_addr,
+   uint32_t spi_gdbg_per_vmid_cntl,
+   const uint32_t *tcp_watch_cntl,
+   uint32_t flags);
 
 int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
int queue_type, int idx,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 5826eac270d7..08793dac618c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -334,6 +334,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
misc_pkt.wait_

[PATCH 23/34] drm/amdkfd: add debug set exceptions enabled operation

2023-03-27 Thread Jonathan Kim
The debugger subscribes to notification for requested exceptions on attach.
Allow the debugger to change its subscription later on.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2a8d30dbb3f6..aa90e5874ad2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2966,6 +2966,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->send_runtime_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+   kfd_dbg_set_enabled_debug_exception_mask(target,
+   args->set_exceptions_enabled.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index eead93e7fdc6..9ed364a20398 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -521,3 +521,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
 
return r;
 }
+
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask)
+{
+   uint64_t found_mask = 0;
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   int i;
+
+   mutex_lock(>event_mutex);
+
+   found_mask |= target->exception_status;
+
+   pqm = >pqm;
+   list_for_each_entry(pqn, >queues, process_queue_list) {
+   if (!pqn)
+   continue;
+
+   found_mask |= pqn->q->properties.exception_status;
+   }
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   found_mask |= pdd->exception_status;
+   }
+
+   if (exception_set_mask & found_mask)
+   kernel_write(target->dbg_ev_file, _data, 1, );
+
+   target->exception_enable_mask = exception_set_mask;
+
+   mutex_unlock(>event_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index c71f1595fdc8..f25dcf539f7b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -56,6 +56,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct 
kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask);
 /*
  * If GFX off is enabled, chips that do not support RLC restore for the debug
  * registers will disable GFX off temporarily for the entire debug session.
-- 
2.25.1



[PATCH 24/34] drm/amdkfd: add debug wave launch override operation

2023-03-27 Thread Jonathan Kim
This operation allows the debugger to override the enabled HW
exceptions on the device.

On debug devices that only support the debugging of a single process,
the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
register.
Because they are global, only address watch exceptions are allowed to
be enabled.  In other words, the debugger must preserve all non-address
watch exception states in normal mode operation by barring a full
replacement override or a non-address watch override request.

For multi-process debugging, all HW exception overrides are per-VMID so
all exceptions can be overridden or fully replaced.

In order for the debugger to know what is permissible, return the
supported override mask back to the debugger along with the previously
enabled overrides.
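
For the single-process (device-global) case the validate callback is expected
to behave roughly like this sketch (not the actual GFX9 implementation, which
is outside this excerpt): only address watch may be overridden, and only in
OR mode.

static int validate_trap_override_single_process_sketch(struct amdgpu_device *adev,
							 uint32_t trap_override,
							 uint32_t *trap_mask_supported)
{
	/* Global trap config: preserve everything except address watch. */
	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;

	/* A full replacement would clobber non-debug exception state. */
	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
		return -EPERM;

	return 0;
}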

v5: gfx off optimization to be done later

v4: move ioctl header reference here when it's actually required

v3: v2 was reviewed but requesting re-review for added GFX11 support.

v2: switch unsupported override mode return from EPERM to EINVAL to
support unique EPERM on PTRACE failure.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 55 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h| 10 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 87 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 69 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  6 ++
 11 files changed, 351 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index b811a0985050..d7881bbd828d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -25,6 +25,7 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
+#include 
 
 /*
  * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
@@ -62,6 +63,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct 
amdgpu_device *adev,
return data;
 }
 
+static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device 
*adev,
+   uint32_t trap_override,
+   uint32_t 
*trap_mask_supported)
+{
+   *trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+   KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+   KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+   KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+   KFD_DBG_TRAP_MASK_FP_INEXACT |
+   KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+   KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+   if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+   trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+   return -EPERM;
+
+   return 0;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
+   uint32_t vmid,
+   uint32_t trap_override,
+   uint32_t trap_mask_bits,
+   uint32_t trap_mask_request,
+   uint32_t *trap_mask_prev,
+   uint32_t kfd_dbg_trap_cntl_prev)
+
+{
+   uint32_t data = 0;
+
+   *trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, 
SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+   trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+   (*trap_mask_prev & ~trap_mask_request);
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 
trap_mask_bits);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 
trap_override);
+
+   return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -82,6 +127,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.enable_debug_trap = kgd_aldeba

[PATCH 29/34] drm/amdkfd: add debug query event operation

2023-03-27 Thread Jonathan Kim
Allow the debugger to query a single queue, device and process
exception.
The KFD should also return the GPU or Queue id of the exception.
The debugger also has the option of clearing exceptions after
being queried.
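
From the debugger side the expected usage is to poll the event file
descriptor supplied at attach and then drain events until the KFD reports
nothing left (-EAGAIN). A hedged userspace sketch; the ioctl macro, args
layout and handle_event() are assumed/illustrative:

#include <poll.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static void drain_debug_events(int kfd_fd, int dbg_ev_fd, uint32_t pid)
{
	struct pollfd pfd = { .fd = dbg_ev_fd, .events = POLLIN };

	poll(&pfd, 1, -1);	/* KFD writes a byte when a subscribed event fires */

	for (;;) {
		struct kfd_ioctl_dbg_trap_args args = {0};

		args.pid = pid;
		args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
		args.query_debug_event.exception_mask = ~0ULL;	/* clear on query */

		if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
			break;	/* -EAGAIN: nothing left to report */

		/* handle_event() is the debugger's own dispatch routine. */
		handle_event(args.query_debug_event.gpu_id,
			     args.query_debug_event.queue_id,
			     args.query_debug_event.exception_mask);
	}
}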

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  6 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 64 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 75 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index da3478b133bd..593ede6b2af5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -3024,6 +3024,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
r = kfd_dbg_trap_set_flags(target, >set_flags.flags);
break;
case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+   r = kfd_dbg_ev_query_debug_event(target,
+   >query_debug_event.queue_id,
+   >query_debug_event.gpu_id,
+   args->query_debug_event.exception_mask,
+   >query_debug_event.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 5d3193ae71e3..e7f1521ac593 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -27,6 +27,70 @@
 
 #define MAX_WATCH_ADDRESSES4
 
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+ unsigned int *queue_id,
+ unsigned int *gpu_id,
+ uint64_t exception_clear_mask,
+ uint64_t *event_status)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+
+   if (!(process && process->debug_trap_enabled))
+   return -ENODATA;
+
+   mutex_lock(>event_mutex);
+   *event_status = 0;
+   *queue_id = 0;
+   *gpu_id = 0;
+
+   /* find and report queue events */
+   pqm = >pqm;
+   list_for_each_entry(pqn, >queues, process_queue_list) {
+   uint64_t tmp = process->exception_enable_mask;
+
+   if (!pqn->q)
+   continue;
+
+   tmp &= pqn->q->properties.exception_status;
+
+   if (!tmp)
+   continue;
+
+   *event_status = pqn->q->properties.exception_status;
+   *queue_id = pqn->q->properties.queue_id;
+   *gpu_id = pqn->q->device->id;
+   pqn->q->properties.exception_status &= ~exception_clear_mask;
+   goto out;
+   }
+
+   /* find and report device events */
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+   uint64_t tmp = process->exception_enable_mask
+   & pdd->exception_status;
+
+   if (!tmp)
+   continue;
+
+   *event_status = pdd->exception_status;
+   *gpu_id = pdd->dev->id;
+   pdd->exception_status &= ~exception_clear_mask;
+   goto out;
+   }
+
+   /* report process events */
+   if (process->exception_enable_mask & process->exception_status) {
+   *event_status = process->exception_status;
+   process->exception_status &= ~exception_clear_mask;
+   }
+
+out:
+   mutex_unlock(>event_mutex);
+   return *event_status ? 0 : -EAGAIN;
+}
+
 void debug_event_write_work_handler(struct work_struct *work)
 {
struct kfd_process *process;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 782362d82890..4f2195d57ff0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -27,6 +27,11 @@
 
 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int 
unwind_count);
 int kfd_dbg_trap_activate(struct kfd_process *target);
+int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
+   unsigned int *queue_id,
+   unsigned int *gpu_id,
+   uint64_t exception_clear_mask,
+   uint64_t *event_status);
 bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
   unsigned int pasid,
   uint32_t doorbell_id,
-- 
2.25.1



[PATCH 11/34] drm/amdgpu: add gfx11 hw debug mode enable and disable calls

2023-03-27 Thread Jonathan Kim
Implement the per-device calls to enable or disable HW debug mode
for GFX11.

v2: remove unneeded ioctl reference and fix types and comment formats.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 38 +++
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
index 7e80caa05060..dd282085d6c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
@@ -606,6 +606,42 @@ static void set_vm_context_page_table_base_v11(struct 
amdgpu_device *adev,
adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_gfx_v11_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
+/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
 const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.program_sh_mem_settings = program_sh_mem_settings_v11,
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
@@ -622,4 +658,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.wave_control_execute = wave_control_execute_v11,
.get_atc_vmid_pasid_mapping_info = NULL,
.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
+   .enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
+   .disable_debug_trap = kgd_gfx_v11_disable_debug_trap
 };
-- 
2.25.1



[PATCH 22/34] drm/amdkfd: update process interrupt handling for debug events

2023-03-27 Thread Jonathan Kim
The debugger must be notified by any debugger subscribed exception
that comes from hardware interrupts.

If a debugger session exits, any exceptions it subscribed to may still
have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
To prevent a new session from inheriting stale interrupts, when a new
queue is created, open an interrupt drain and allow the IH ring to drain
from a timestamped checkpoint.  Then inject a custom IV so that once
the custom IV is picked up by the KFD, it's safe to close the drain
and proceed with queue creation.

The drain must also be done on debug disable as SW interrupts may still
be processed.  Drain at this time and clear all the exception status.

The debugger may also not be attached or subscribed to certain
exceptions, so forward those directly to the runtime.

GFX10 also requires its own IV processing, hence the creation of
kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
packed into a new contiguous format, unlike GFX9. To make this clear,
a separate interrupt handling code file was created.
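
The drain around queue creation can be pictured as the sketch below; only
amdgpu_amdkfd_send_close_event_drain_irq() is from this patch, while the
flag, wait queue and payload layout are illustrative stand-ins:

static int drain_stale_interrupts_sketch(struct kfd_process_device *pdd)
{
	uint32_t drain_payload[8] = {0};	/* custom IV identifying this process */

	/* Open the drain: interrupts for this process are discarded from here on. */
	WRITE_ONCE(pdd->process->irq_drain_is_open, true);

	/* Checkpoint the IH ring and inject the custom IV behind it. */
	if (amdgpu_amdkfd_send_close_event_drain_irq(pdd->dev->adev,
						     drain_payload)) {
		WRITE_ONCE(pdd->process->irq_drain_is_open, false);
		return -EIO;
	}

	/* Once the KFD interrupt handler sees the custom IV, everything queued
	 * before it has been flushed and the drain can be closed. */
	wait_event(pdd->process->wait_irq_drain,
		   !READ_ONCE(pdd->process->irq_drain_is_open));
	return 0;
}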

v4: fixup indentation formatting
v3: enable gfx11 interrupts
v2: fix interrupt drain on debug disable.
fix interrupt drain on queue create during -ERESTARTSYS.
fix up macros naming for ECODE parsing.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
 drivers/gpu/drm/amd/amdkfd/Makefile   |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  84 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  21 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 -
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  47 ++
 .../amd/amdkfd/kfd_process_queue_manager.c|   4 +
 12 files changed, 680 insertions(+), 20 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index f99d4873bf22..8ab5bbb53723 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -763,6 +763,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev, bo
amdgpu_umc_poison_handler(adev, reset);
 }
 
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload)
+{
+   int ret;
+
+   /* Device or IH ring is not ready so bail. */
+   ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, >irq.ih);
+   if (ret)
+   return ret;
+
+   /* Send payload to fence KFD interrupts */
+   amdgpu_amdkfd_interrupt(adev, payload);
+
+   return 0;
+}
+
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
 {
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 01ba3589b60a..83cf1943ee34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -241,6 +241,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct 
amdgpu_device *dst,
struct amdgpu_device *src,
bool is_min);
 int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool 
is_min);
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 747754428073..2ec8f27c5366 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
+   $(AMDKFD_PATH)/kfd_int_process_v10.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index c779acb9a623..eead93e7fdc6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,64 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+/* set pending event queue entry from 

[PATCH 18/34] drm/amdkfd: add raise exception event function

2023-03-27 Thread Jonathan Kim
Exception events can be generated from interrupts or queue activity.

The raise event function will save exception status of a queue, device
or process then notify the debugger of the status change by writing to
a debugger polled file descriptor that the debugger provides during
debug attach.

For memory violation exceptions, extra exception data will be saved.

The debugger will be able to query the saved exception states via a query
operation that will be provided by follow-up patches.
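
For example, an interrupt handler that decodes a memory violation could
report it along these lines (hedged sketch; kfd_dbg_ev_raise() and
KFD_EC_MASK() are from this series, the fallback signalling is illustrative):

static void report_mem_violation_sketch(struct kfd_dev *dev,
					struct kfd_process *p,
					unsigned int source_id,
					struct kfd_hsa_memory_exception_data *data)
{
	/* use_worker = true: defer the file descriptor write to the
	 * debug event worker since we may be in interrupt context. */
	bool subscribed = kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
					   p, dev, source_id, true,
					   data, sizeof(*data));

	/* Debugger absent or not subscribed: forward to the runtime instead. */
	if (!subscribed)
		kfd_signal_hw_exception_event(p->pasid);
}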

v3: move debug event worker definitions to this patch.

v2: WIP - fix missing brace

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 104 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c |   2 +
 4 files changed, 123 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 7e6d8694938d..23ca8123f8f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -24,6 +24,107 @@
 #include "kfd_device_queue_manager.h"
 #include 
 
+void debug_event_write_work_handler(struct work_struct *work)
+{
+   struct kfd_process *process;
+
+   static const char write_data = '.';
+   loff_t pos = 0;
+
+   process = container_of(work,
+   struct kfd_process,
+   debug_event_workarea);
+
+   kernel_write(process->dbg_ev_file, _data, 1, );
+}
+
+/* update process/device/queue exception status, write to descriptor
+ * only if exception_status is enabled.
+ */
+bool kfd_dbg_ev_raise(uint64_t event_mask,
+   struct kfd_process *process, struct kfd_dev *dev,
+   unsigned int source_id, bool use_worker,
+   void *exception_data, size_t exception_data_size)
+{
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   int i;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   bool is_subscribed = true;
+
+   if (!(process && process->debug_trap_enabled))
+   return false;
+
+   mutex_lock(>event_mutex);
+
+   if (event_mask & KFD_EC_MASK_DEVICE) {
+   for (i = 0; i < process->n_pdds; i++) {
+   struct kfd_process_device *pdd = process->pdds[i];
+
+   if (pdd->dev != dev)
+   continue;
+
+   pdd->exception_status |= event_mask & 
KFD_EC_MASK_DEVICE;
+
+   if (event_mask & 
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   if (!pdd->vm_fault_exc_data) {
+   pdd->vm_fault_exc_data = kmemdup(
+   exception_data,
+   exception_data_size,
+   GFP_KERNEL);
+   if (!pdd->vm_fault_exc_data)
+   pr_debug("Failed to allocate 
exception data memory");
+   } else {
+   pr_debug("Debugger exception data not 
saved\n");
+   print_hex_dump_bytes("exception data: ",
+   DUMP_PREFIX_OFFSET,
+   exception_data,
+   exception_data_size);
+   }
+   }
+   break;
+   }
+   } else if (event_mask & KFD_EC_MASK_PROCESS) {
+   process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
+   } else {
+   pqm = >pqm;
+   list_for_each_entry(pqn, >queues,
+   process_queue_list) {
+   int target_id;
+
+   if (!pqn->q)
+   continue;
+
+   target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
+   pqn->q->properties.queue_id :
+   pqn->q->doorbell_id;
+
+   if (pqn->q->device != dev || target_id != source_id)
+   continue;
+
+   pqn->q->properties.exception_status |= event_mask;
+   break;
+   }
+   }
+
+   if (process->exception_enable_mask & event_mask) {
+   if (use_worker)
+   schedule_work(>debug_event_workarea);
+   e

[PATCH 08/34] drm/amdkfd: fix kfd_suspend_all_processes for gfx941 debugging

2023-03-27 Thread Jonathan Kim
The debugger for GFX9.4.1 uses kfd_suspend_all_processes to pause the
compute pipeline so it can safely toggle the SQ's implicit wait on
barrier setting during debug attach/detach to work around the wave
exception s_barrier race condition.

For mGPU setups, repeated calls to cancel all outstanding restore work can
result in an asymmetric permanent cancelling of the restored work from the
debug device after it has toggled the HW workaround settings.
Instead of cancelling the outstanding restore work, just flush it as it
will be properly evicted anyway by the current suspend call.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 1e3795e7e18d..55a4ddd35e12 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2008,7 +2008,7 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
cancel_delayed_work_sync(>eviction_work);
-   cancel_delayed_work_sync(>restore_work);
+   flush_delayed_work(>restore_work);
 
if (kfd_process_evict_queues(p, 
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
-- 
2.25.1



[PATCH 06/34] drm/amdgpu: add gfx9 hw debug mode enable and disable calls

2023-03-27 Thread Jonathan Kim
Implement the per-device calls to enable or disable HW debug mode for
GFX9 prior to GFX9.4.1.

GFX9.4.1 and onward will require their own enable/disable sequence as
follow on patches.

When hardware debug mode setting is requested, waves will inherit
these settings in the Shader Processor Input's (SPI) Sequencer Global
Block (SQG). This means that the KGD must drain all waves from the SPI
into SQG (approximately 96 SPI clock cycles) prior to debug mode setting
to ensure that the order of operations that the debugger expects with
regard to debug mode setting transaction requests and wave inheritance
of that mode is upheld.

Also ensure that exception overrides are reset to their original state
prior to debug enable or disable.

v3: fixup typo and comment format complaints from kern bot.
Also remove unrequired gfx9 wave_launch_stall from kfd_debug.h

v2: remove unnecessary static srbm lock renaming.
add comments to explain ignored arguments for debug trap enable and
disable.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 +++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++
 2 files changed, 101 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index e92b93557c13..e0bd61e16847 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -646,6 +646,96 @@ int kgd_gfx_v9_wave_control_execute(struct amdgpu_device 
*adev,
return 0;
 }
 
+/*
+ * GFX9 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~96 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads 
required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because GFX9.4.1 cannot support multi-process debugging due to trap
+ *   configuration and masking being limited to global scope.  Always assume
+ *   single process conditions.
+ */
+#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY   3
+void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
+   uint32_t vmid,
+   bool stall)
+{
+   int i;
+   uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1))
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+   stall ? 1 << vmid : 0);
+   else
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
+   stall ? 1 : 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+   if (!stall)
+   return;
+
+   for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+   RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+/*
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+}
+
+/*
+ * keep_trap_enabled is ignored here but is a general interface requirement
+ * for devices that support multi-process debugging where the performance
+ * overhead from trap temporary setup needs to be bypassed when the debug
+ * session has ended.
+ */
+uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   

[PATCH 02/34] drm/amdkfd: display debug capabilities

2023-03-27 Thread Jonathan Kim
Expose debug capabilities in the KFD topology node's HSA capabilities and
debug properties flags.

Ensure correct capabilities are exposed based on firmware support.

Flag definitions can be referenced in uapi/linux/kfd_sysfs.h.
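
Userspace can read the new property next to the existing capability word; a
minimal sketch, assuming the usual topology sysfs location:

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

int main(void)
{
	/* Assumed path; node 0 is used for brevity. */
	FILE *f = fopen("/sys/class/kfd/kfd/topology/nodes/0/properties", "r");
	char name[64];
	uint64_t val;

	if (!f)
		return 1;

	while (fscanf(f, "%63s %" SCNu64, name, &val) == 2)
		if (!strcmp(name, "capability") || !strcmp(name, "debug_prop"))
			printf("%s 0x%" PRIx64 "\n", name, val);

	fclose(f);
	return 0;
}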

v2: v1 was reviewed but re-requesting review for the following.
- remove asic family code name comments in firmware support checking
- add gfx11 requirements in fw support checks and debug props and caps

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 101 --
 drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   6 ++
 include/uapi/linux/kfd_sysfs.h|  15 
 3 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 8e4124dcb6e4..0deeb87e5a2d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -551,6 +551,8 @@ static ssize_t node_show(struct kobject *kobj, struct 
attribute *attr,
  dev->gpu->mec_fw_version);
sysfs_show_32bit_prop(buffer, offs, "capability",
  dev->node_props.capability);
+   sysfs_show_64bit_prop(buffer, offs, "debug_prop",
+ dev->node_props.debug_prop);
sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
  dev->gpu->sdma_fw_version);
sysfs_show_64bit_prop(buffer, offs, "unique_id",
@@ -1865,6 +1867,97 @@ static int kfd_topology_add_device_locked(struct kfd_dev 
*gpu, uint32_t gpu_id,
return res;
 }
 
+static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device 
*dev)
+{
+   bool firmware_supported = true;
+
+   if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
+   KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
+   firmware_supported =
+   (dev->gpu->adev->mes.sched_version & 
AMDGPU_MES_VERSION_MASK) >= 9;
+   goto out;
+   }
+
+   /*
+* Note: Any unlisted devices here are assumed to support exception 
handling.
+* Add additional checks here as needed.
+*/
+   switch (KFD_GC_VERSION(dev->gpu)) {
+   case IP_VERSION(9, 0, 1):
+   firmware_supported = dev->gpu->mec_fw_version >= 459 + 32768;
+   break;
+   case IP_VERSION(9, 1, 0):
+   case IP_VERSION(9, 2, 1):
+   case IP_VERSION(9, 2, 2):
+   case IP_VERSION(9, 3, 0):
+   case IP_VERSION(9, 4, 0):
+   firmware_supported = dev->gpu->mec_fw_version >= 459;
+   break;
+   case IP_VERSION(9, 4, 1):
+   firmware_supported = dev->gpu->mec_fw_version >= 60;
+   break;
+   case IP_VERSION(9, 4, 2):
+   firmware_supported = dev->gpu->mec_fw_version >= 51;
+   break;
+   case IP_VERSION(10, 1, 10):
+   case IP_VERSION(10, 1, 2):
+   case IP_VERSION(10, 1, 1):
+   firmware_supported = dev->gpu->mec_fw_version >= 144;
+   break;
+   case IP_VERSION(10, 3, 0):
+   case IP_VERSION(10, 3, 2):
+   case IP_VERSION(10, 3, 1):
+   case IP_VERSION(10, 3, 4):
+   case IP_VERSION(10, 3, 5):
+   firmware_supported = dev->gpu->mec_fw_version >= 89;
+   break;
+   case IP_VERSION(10, 1, 3):
+   case IP_VERSION(10, 3, 3):
+   firmware_supported = false;
+   break;
+   default:
+   break;
+   }
+
+out:
+   if (firmware_supported)
+   dev->node_props.capability |= 
HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
+}
+
+static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
+{
+   dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+   HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+   HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+
+   dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT |
+   HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED |
+   HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED;
+
+   if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) {
+   dev->node_props.debug_prop |= 
HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 |
+   HSA_DBG_WATCH_ADDR_MASK_HI_BIT;
+
+   if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2))
+   dev->node_props.debug_prop |=
+   HSA_DBG_DISPATCH_INFO_ALWAYS_VALID;
+   else
+   dev->node_props.capability |

[PATCH 05/34] drm/amdgpu: setup hw debug registers on driver initialization

2023-03-27 Thread Jonathan Kim
Add missing debug trap registers references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto dispatch during compute context inspection.
In order to correctly set this up, set the special reserved CP bit by
default whenever the MQD is initialized.
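
For illustration, the trap config VMID mask written in the hunks below
is just the bit range of KFD VMIDs; a standalone sketch with example
values (the real range comes from adev->vm_manager.first_kfd_vmid and
AMDGPU_NUM_VMID):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t trap_config_vmid_mask = 0;
	int i;

	/* Example range: first_vmid = 8, last_vmid = 16 sets bits 8..15. */
	for (i = 8; i < 16; i++)
		trap_config_vmid_mask |= 1u << i;

	printf("VMID_SEL = 0x%04x\n", trap_config_vmid_mask); /* 0xff00 */
	return 0;
}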

v2: leave TRAP_EN set for multi-process debugging, as per-process disable
will be taken care of in later patches.
fixup typo in description.
enable ttmp setup for dispatch boundary in mqd init for gfx11.
add trap on wave start and end registers for gfx11.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 26 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 30 
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h| 14 
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++
 .../include/asic_reg/gc/gc_10_3_0_offset.h| 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 .../include/asic_reg/gc/gc_11_0_0_sh_mask.h   |  4 ++
 11 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 516409989235..b62596140db4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4823,6 +4823,29 @@ static u32 
gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES   (0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
int i;
@@ -4854,6 +4877,9 @@ static void gfx_v10_0_init_compute_vmid(struct 
amdgpu_device *adev)
WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
}
+
+   gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+   AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ecf8ceb53311..56c211b790db 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1603,6 +1603,7 @@ static void gfx_v11_0_init_compute_vmid(struct 
amdgpu_device *adev)
/* Enable trap for each kfd vmid. */
data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL);
data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   WREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL, data);
}
soc21_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(>srbm_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index ae09fc1cfe6b..d3ad9e9d4133 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2289,6 +2289,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(

[PATCH 12/34] drm/amdgpu: add configurable grace period for unmap queues

2023-03-27 Thread Jonathan Kim
The HWS scheduler allows a grace period for wave completion prior to
preemption for better performance by avoiding CWSR on waves that can
potentially complete quickly. The debugger, on the other hand, will
want to inspect wave status immediately after it actively triggers
preemption (a suspend function to be provided).

To minimize latency between preemption and debugger wave inspection, allow
immediate preemption by setting the grace period to 0.

Note that setting the preemption grace period to 0 will result in an
infinite grace period being set due to a CP FW bug so set it to 1 for now.
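
A minimal sketch of that guard, stated on its own (illustrative only;
the actual check sits in the grace period packet helpers added below):

#include <stdint.h>

/* A requested grace period of 0 (immediate preemption) must not reach
 * the CP as-is, since CP FW treats 0 as an infinite grace period, so it
 * is bumped to 1. */
uint32_t effective_grace_period(uint32_t requested)
{
	return requested ? requested : 1;
}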

v2: clarify purpose in the description of this patch

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 43 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  6 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  2 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 43 
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  9 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 62 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   | 32 +
 .../drm/amd/amdkfd/kfd_packet_manager_v9.c| 39 +++
 .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   | 65 +++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 ++
 13 files changed, 291 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a6f98141c29c..b811a0985050 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -82,5 +82,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index d2918e5c0dea..a62bd0068515 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -410,6 +410,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
kgd_gfx_v9_set_vm_context_page_table_base,
.enable_debug_trap = kgd_arcturus_enable_debug_trap,
.disable_debug_trap = kgd_arcturus_disable_debug_trap,
+   .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
+   .build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 969015281510..605387e55d33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -802,6 +802,47 @@ uint32_t kgd_gfx_v10_disable_debug_trap(struct 
amdgpu_device *adev,
return 0;
 }
 
+/* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
+ * The values read are:
+ * ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
+ * atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
+ * wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
+ * gws_wait_time            -- Wait Count for Global Wave Syncs.
+ * que_sleep_wait_time      -- Wait Count for Dequeue Retry.
+ * sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
+ * sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
+ * deq_retry_wait_time      -- Wait Count for Global Wave Syncs.
+ */
+void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
+   uint32_t *wait_times)
+
+{
+   *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
+}
+
+void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
+   uint32_t wait_times,
+   uint32_t grace_period,
+   uint32_t *reg_offset,
+   uint32_t *reg_data)
+{
+   *reg_data = wait_times;
+
+   /*
+* The CP cannot handle a 0 grace period input and will result in
+* an infinite grace period being set so set to 1 to prevent

[PATCH 09/34] drm/amdgpu: add gfx10 hw debug mode enable and disable calls

2023-03-27 Thread Jonathan Kim
Similar to GFX9 debug devices, set the hardware debug mode by draining
the SPI appropriately prior to the mode setting request.

Because GFX10 has waves allocated by the work group boundary and each
SE's SPI instances do not communicate, the SPI drain time is much longer.
This long drain time will be fixed for GFX11 onwards.
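
For reference, the dummy-read counts used in this series come from a
simple ceiling division over the cycle figures quoted in the code
comments (~32 cycles per SPI_GDBG_WAVE_CNTL read, ~96 drain cycles on
GFX9, ~3500 on GFX10); a sketch:

/* ceil(drain_cycles / cycles_per_read): 96/32 -> 3 reads (GFX9),
 * 3500/32 -> 110 reads (GFX10), matching the *_SPI_DRAIN_LATENCY defines. */
unsigned int spi_drain_reads(unsigned int drain_cycles,
			     unsigned int cycles_per_read)
{
	return (drain_cycles + cycles_per_read - 1) / cycles_per_read;
}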

Also remove a bunch of deprecated misplaced references for GFX10.3.

v2: fix 'boundaray' typo in description and add gfx10 kgd2kfd header
to avoid kern bot missing prototype complaint.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  96 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|  28 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  | 147 +-
 3 files changed, 127 insertions(+), 144 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 9378fc79e9ea..969015281510 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -21,6 +21,7 @@
  */
 #include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_amdkfd_gfx_v10.h"
 #include "gc/gc_10_1_0_offset.h"
 #include "gc/gc_10_1_0_sh_mask.h"
 #include "athub/athub_2_0_0_offset.h"
@@ -708,6 +709,99 @@ static void set_vm_context_page_table_base(struct 
amdgpu_device *adev,
adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
 }
 
+/*
+ * GFX10 helper for wave launch stall requirements on debug trap setting.
+ *
+ * vmid:
+ *   Target VMID to stall/unstall.
+ *
+ * stall:
+ *   0-unstall wave launch (enable), 1-stall wave launch (disable).
+ *   After wavefront launch has been stalled, allocated waves must drain from
+ *   SPI in order for debug trap settings to take effect on those waves.
+ *   This is roughly a ~3500 clock cycle wait on SPI where a read on
+ *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
+ *   KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads 
required.
+ *
+ *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
+ *   because current GFX10 chips cannot support multi-process debugging due to
+ *   trap configuration and masking being limited to global scope.  Always
+ *   assume single process conditions.
+ *
+ */
+
+#define KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY  110
+static void kgd_gfx_v10_set_wave_launch_stall(struct amdgpu_device *adev, 
uint32_t vmid, bool stall)
+{
+   uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+   int i;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
+   stall ? 1 << vmid : 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);
+
+   if (!stall)
+   return;
+
+   for (i = 0; i < KGD_GFX_V10_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
+   RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
+}
+
+uint32_t kgd_gfx_v10_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   /* assume gfx off is disabled for the debug session if rlc restore not 
supported. */
+   if (restore_dbg_registers) {
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, 1 << vmid);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+   }
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+}
+
+uint32_t kgd_gfx_v10_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   mutex_lock(>grbm_idx_mutex);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
+
+   mutex_unlock(>grbm_idx_mutex);
+
+   return 0;
+}
+
 static void program_trap_handler_settings

[PATCH 10/34] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls

2023-03-27 Thread Jonathan Kim
GFX9.4.2 now supports per-VMID debug mode controls registers
(SPI_GDBG_PER_VMID_CNTL).

Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
forward all debug mode setting register writes to the HWS scheduler
using a new MAP_PROCESS API, so instead of writing to registers, return
the required register values that the HWS needs to write on debug enable
and disable.

v3: fix typo and comment format kern bot complaint.
add back cu occupancy that was removed by mistake.

v2: add commentary on unused restore_dbg_registers for debug enable.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 42 ++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4485bb29bec9..a6f98141c29c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,44 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "gc/gc_9_4_2_offset.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+/*
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -42,5 +80,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
+   .disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
 };
-- 
2.25.1



[PATCH 01/34] drm/amdkfd: add debug and runtime enable interface

2023-03-27 Thread Jonathan Kim
Introduce the GPU debug operations interface.

For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU
instruction set, provide the necessary interface to allow the debugger
to HW debug-mode set and query exceptions per HSA queue, process or
device.

The runtime_enable interface coordinates exception handling with the
HSA runtime.

Usage is available in the kern docs at uapi/linux/kfd_ioctl.h.
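
As a rough user-space illustration only: the call shape below sets just
the op and pid fields that the chardev handler in this patch
dereferences; the ENABLE op takes additional parameters in the args
union (see kfd_ioctl.h) which are omitted here.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kfd_ioctl.h>	/* must carry the definitions from this series */

int main(void)
{
	struct kfd_ioctl_dbg_trap_args args = {0};
	int fd = open("/dev/kfd", O_RDWR | O_CLOEXEC);

	if (fd < 0)
		return 1;

	args.op = KFD_IOC_DBG_TRAP_ENABLE;
	args.pid = 1234;	/* hypothetical target pid */

	if (ioctl(fd, AMDKFD_IOC_DBG_TRAP, &args))
		perror("AMDKFD_IOC_DBG_TRAP");

	close(fd);
	return 0;
}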

v3:  previously reviewed but updated context save area header
struct from suggestions on patch 25/32.

v2: was previously reviewed but removed deprecated wave launch modes
(kill and disable).
Also remove the unneeded dbg flag option.
Add revision and subvendor info to debug device snapshot entry.
Add trap on wave start and end override option.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  48 ++
 include/uapi/linux/kfd_ioctl.h   | 667 ++-
 2 files changed, 714 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 7228a3db63a2..13a630391a6a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2715,6 +2715,48 @@ static int kfd_ioctl_criu(struct file *filep, struct 
kfd_process *p, void *data)
return ret;
 }
 
+static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, 
void *data)
+{
+   return 0;
+}
+
+static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, 
void *data)
+{
+   struct kfd_ioctl_dbg_trap_args *args = data;
+   int r = 0;
+
+   if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+   pr_err("Debugging does not support sched_policy %i", 
sched_policy);
+   return -EINVAL;
+   }
+
+   switch (args->op) {
+   case KFD_IOC_DBG_TRAP_ENABLE:
+   case KFD_IOC_DBG_TRAP_DISABLE:
+   case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+   case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+   case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
+   case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
+   case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
+   case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
+   case KFD_IOC_DBG_TRAP_SET_FLAGS:
+   case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
+   case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+   case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+   case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
+   pr_warn("Debugging not supported yet\n");
+   r = -EACCES;
+   break;
+   default:
+   pr_err("Invalid option: %i\n", args->op);
+   r = -EINVAL;
+   }
+
+   return r;
+}
+
 #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -2827,6 +2869,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_EXPORT_DMABUF,
kfd_ioctl_export_dmabuf, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE,
+   kfd_ioctl_runtime_enable, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP,
+   kfd_ioctl_set_debug_trap, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index 2da5c3ad71bd..0e05c0f3f348 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -110,6 +110,32 @@ struct kfd_ioctl_get_available_memory_args {
__u32 pad;
 };
 
+struct kfd_dbg_device_info_entry {
+   __u64 exception_status;
+   __u64 lds_base;
+   __u64 lds_limit;
+   __u64 scratch_base;
+   __u64 scratch_limit;
+   __u64 gpuvm_base;
+   __u64 gpuvm_limit;
+   __u32 gpu_id;
+   __u32 location_id;
+   __u32 vendor_id;
+   __u32 device_id;
+   __u32 revision_id;
+   __u32 subsystem_vendor_id;
+   __u32 subsystem_device_id;
+   __u32 fw_version;
+   __u32 gfx_target_version;
+   __u32 simd_count;
+   __u32 max_waves_per_simd;
+   __u32 array_count;
+   __u32 simd_arrays_per_engine;
+   __u32 capability;
+   __u32 debug_prop;
+   __u32 pad;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -773,6 +799,639 @@ struct kfd_ioctl_set_xnack_mode_args {
__s32 xnack_enabled;
 };
 
+/* Wave launch override modes */
+enum kfd_dbg_trap_override_mode {
+   KFD_DBG_TRAP_OVERRIDE_OR = 0,
+   KFD_DBG_TRAP_OVERRIDE_REPLACE = 1
+};
+
+/* Wave launch overrides */
+enum kfd_dbg_trap_mask {
+   KFD_DBG_TRAP_MASK_FP_INVALID = 1,
+  

[PATCH 04/34] drm/amdgpu: add kgd hw debug mode setting interface

2023-03-27 Thread Jonathan Kim
Introduce the required KGD debug calls that will execute hardware debug
mode setting.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 34 +++
 1 file changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 5cb3e8634739..15e7a5c920a0 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -289,6 +289,40 @@ struct kfd2kgd_calls {
uint32_t vmid, uint64_t page_table_base);
uint32_t (*read_vmid_from_vmfault_reg)(struct amdgpu_device *adev);
 
+   uint32_t (*enable_debug_trap)(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid);
+   uint32_t (*disable_debug_trap)(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid);
+   int (*validate_trap_override_request)(struct amdgpu_device *adev,
+   uint32_t trap_override,
+   uint32_t *trap_mask_supported);
+   uint32_t (*set_wave_launch_trap_override)(struct amdgpu_device *adev,
+uint32_t vmid,
+uint32_t trap_override,
+uint32_t trap_mask_bits,
+uint32_t trap_mask_request,
+uint32_t *trap_mask_prev,
+uint32_t kfd_dbg_trap_cntl_prev);
+   uint32_t (*set_wave_launch_mode)(struct amdgpu_device *adev,
+   uint8_t wave_launch_mode,
+   uint32_t vmid);
+   uint32_t (*set_address_watch)(struct amdgpu_device *adev,
+   uint64_t watch_address,
+   uint32_t watch_address_mask,
+   uint32_t watch_id,
+   uint32_t watch_mode,
+   uint32_t debug_vmid);
+   uint32_t (*clear_address_watch)(struct amdgpu_device *adev,
+   uint32_t watch_id);
+   void (*get_iq_wait_times)(struct amdgpu_device *adev,
+   uint32_t *wait_times);
+   void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
+   uint32_t wait_times,
+   uint32_t grace_period,
+   uint32_t *reg_offset,
+   uint32_t *reg_data);
void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
int *wave_cnt, int *max_waves_per_cu);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
-- 
2.25.1



[PATCH 03/34] drm/amdkfd: prepare per-process debug enable and disable

2023-03-27 Thread Jonathan Kim
The ROCm debugger will attach to a process to debug by PTRACE and will
expect the KFD to prepare a process for the target PID, whether the
target PID has opened the KFD device or not.

This patch explicitly handles this requirement.  Further HW mode
setting and runtime coordination requirements will be handled in
following patches.

In the case where the target process has not opened the KFD device,
a new KFD process must be created for the target PID.
In this case, neither the debugger nor the target process will have
acquired any VMs, so handle process restoration to correctly account for
this.

To coordinate with HSA runtime, the debugger must be aware of the target
process' runtime enablement status and will copy the runtime status
information into the debugged KFD process for later query.

On enablement, the debugger will subscribe to a set of exceptions where
each exception events will notify the debugger through a pollable FIFO
file descriptor that the debugger provides to the KFD to manage.

Finally on process termination of either the debugger or the target,
debugging must be disabled if it has not been done so.

v4: fix up removal of queue eviction counter reset.
move debug event worker to patch 17 where it's actually used.
fix ioctl disable and process termination disable race by having
ioctl disable check and get mm reference first.

v3: fix typo on debug trap disable and PTRACE ATTACH relax check.
remove unnecessary queue eviction counter reset when there's nothing
to evict.
change err code to EALREADY if attaching to an already attached process.
move debug disable to release worker to avoid race with disable from
ioctl call.

v2: relax debug trap disable and PTRACE ATTACH requirement.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/Makefile   |   3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 102 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  80 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  32 ++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  26 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  31 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  60 +++
 7 files changed, 304 insertions(+), 30 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debug.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index e758c2a24cd0..747754428073 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -55,7 +55,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_debug.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 13a630391a6a..58c08d8a191f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -44,6 +44,7 @@
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
 #include "amdgpu_dma_buf.h"
+#include "kfd_debug.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -142,10 +143,15 @@ static int kfd_open(struct inode *inode, struct file 
*filep)
return -EPERM;
}
 
-   process = kfd_create_process(filep);
+   process = kfd_create_process(current);
if (IS_ERR(process))
return PTR_ERR(process);
 
+   if (kfd_process_init_cwsr_apu(process, filep)) {
+   kfd_unref_process(process);
+   return -EFAULT;
+   }
+
if (kfd_is_locked()) {
dev_dbg(kfd_device, "kfd is locked!\n"
"process %d unreferenced", process->pasid);
@@ -2723,6 +2729,10 @@ static int kfd_ioctl_runtime_enable(struct file *filep, 
struct kfd_process *p, v
 static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, 
void *data)
 {
struct kfd_ioctl_dbg_trap_args *args = data;
+   struct task_struct *thread = NULL;
+   struct mm_struct *mm = NULL;
+   struct pid *pid = NULL;
+   struct kfd_process *target = NULL;
int r = 0;
 
if (sched_policy == KFD_SCHED_POLICY_NO_HWS) {
@@ -2730,9 +2740,81 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
return -EINVAL;
}
 
+   pid = find_get_pid(args->pid);
+   if (!pid) {
+   pr_debug("Cannot find pid info for %i\n", args->pid);
+   r = -ESRCH;
+

[PATCH 09/32] drm/amdgpu: add gfx9.4.2 hw debug mode enable and disable calls

2023-01-25 Thread Jonathan Kim
GFX9.4.2 now supports per-VMID debug mode controls registers
(SPI_GDBG_PER_VMID_CNTL).

Because the KFD lets the HWS handle PASID-VMID mapping, the KFD will
forward all debug mode setting register writes to the HWS scheduler
using a new MAP_PROCESS API, so instead of writing to registers, return
the required register values that the HWS needs to write on debug enable
and disable.

v2: add commentary on unused restore_dbg_registers for debug enable.

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 43 ++-
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4485bb29bec9..89868f9927ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -23,6 +23,44 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_amdkfd_arcturus.h"
 #include "amdgpu_amdkfd_gfx_v9.h"
+#include "gc/gc_9_4_2_offset.h"
+#include "gc/gc_9_4_2_sh_mask.h"
+
+/**
+ * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
+ *
+ * restore_dbg_registers is ignored here but is a general interface requirement
+ * for devices that support GFXOFF and where the RLC save/restore list
+ * does not support hw registers for debugging i.e. the driver has to manually
+ * initialize the debug mode registers after it has disabled GFX off during the
+ * debug session.
+ */
+static uint32_t kgd_aldebaran_enable_debug_trap(struct amdgpu_device *adev,
+   bool restore_dbg_registers,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_disable_debug_trap(struct amdgpu_device *adev,
+   bool keep_trap_enabled,
+   uint32_t vmid)
+{
+   uint32_t data = 0;
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 
keep_trap_enabled);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);
+
+   return data;
+}
 
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
@@ -41,6 +79,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_atc_vmid_pasid_mapping_info =
kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
-   .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .enable_debug_trap = kgd_aldebaran_enable_debug_trap,
+   .disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
 };
-- 
2.25.1



[PATCH 31/32] drm/amdkfd: add debug device snapshot operation

2023-01-25 Thread Jonathan Kim
Similar to queue snapshot, return an array of device information using
an entry_size check and return.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.
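
A rough user-space sketch of that call-twice pattern, using the field
names wired up in the handler below (error handling trimmed, the buffer
pointer cast is illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int snapshot_devices(int kfd_fd, int target_pid)
{
	struct kfd_ioctl_dbg_trap_args args = {0};
	struct kfd_dbg_device_info_entry *buf;

	args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
	args.pid = target_pid;
	args.device_snapshot.entry_size = sizeof(*buf);
	args.device_snapshot.num_devices = 0;	/* ask how many devices exist */

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
		return -1;

	/* KFD wrote back the real device count; retry with a sized buffer. */
	buf = calloc(args.device_snapshot.num_devices, sizeof(*buf));
	if (!buf)
		return -1;

	args.device_snapshot.snapshot_buf_ptr = (__u64)(uintptr_t)buf;
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args)) {
		free(buf);
		return -1;
	}

	/* ... consume buf[0..num_devices-1] ... */
	free(buf);
	return 0;
}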

v3: was reviewed but re-requesting review with new revision and
subvendor information.
memset 0 device info entry to clear padding.

v2: change the buf_size arg to num_devices for more clarity.
expand device entry new members on copy.
fix minimum entry size calculation for queue and device snapshot.
change device snapshot implementation to match queue snapshot
implementation.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 72 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  5 ++
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 93b288233577..da74a6ef4d9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2972,8 +2972,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
>queue_snapshot.entry_size);
break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
-   pr_warn("Debug op %i not supported yet\n", args->op);
-   r = -EACCES;
+   r = kfd_dbg_trap_device_snapshot(target,
+   args->device_snapshot.exception_mask,
+   (void __user 
*)args->device_snapshot.snapshot_buf_ptr,
+   >device_snapshot.num_devices,
+   >device_snapshot.entry_size);
break;
default:
pr_err("Invalid option: %i\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index db316f0625f8..d1c4eb9652fd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -22,6 +22,7 @@
 
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
+#include "kfd_topology.h"
 #include 
 #include 
 
@@ -998,6 +999,77 @@ int kfd_dbg_trap_query_exception_info(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
+   uint64_t exception_clear_mask,
+   void __user *user_info,
+   uint32_t *number_of_device_infos,
+   uint32_t *entry_size)
+{
+   struct kfd_dbg_device_info_entry device_info;
+   uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
+   int i, r = 0;
+
+   if (!(target && user_info && number_of_device_infos && entry_size))
+   return -EINVAL;
+
+   tmp_num_devices = min_t(size_t, *number_of_device_infos, 
target->n_pdds);
+   *number_of_device_infos = target->n_pdds;
+   *entry_size = min_t(size_t, *entry_size, sizeof(device_info));
+
+   if (!tmp_num_devices)
+   return 0;
+
+   memset(_info, 0, sizeof(device_info));
+
+   mutex_lock(>event_mutex);
+
+   /* Run over all pdd of the process */
+   for (i = 0; i < tmp_num_devices; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct kfd_topology_device *topo_dev = 
kfd_topology_device_by_id(pdd->dev->id);
+
+   device_info.gpu_id = pdd->dev->id;
+   device_info.exception_status = pdd->exception_status;
+   device_info.lds_base = pdd->lds_base;
+   device_info.lds_limit = pdd->lds_limit;
+   device_info.scratch_base = pdd->scratch_base;
+   device_info.scratch_limit = pdd->scratch_limit;
+   device_info.gpuvm_base = pdd->gpuvm_base;
+   device_info.gpuvm_limit = pdd->gpuvm_limit;
+   device_info.location_id = topo_dev->node_props.location_id;
+   device_info.vendor_id = topo_dev->node_props.vendor_id;
+   device_info.device_id = topo_dev->node_props.device_id;
+   device_info.revision_id = pdd->dev->adev->pdev->revision;
+   device_info.subsystem_vendor_id = 
pdd->dev->adev->pdev->subsystem_vendor;
+   device_info.subsystem_device_id = 
pdd->dev->adev->pdev->subsystem_device;
+   device_info.fw_version = pdd->dev->mec_fw_version;
+   device_info.gfx_target_version =
+   topo_dev->node_props.gfx_target_version;
+   device_info.simd_count = topo_dev->node_props.simd_count;
+   device_info.max_waves_pe

[PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation

2023-01-25 Thread Jonathan Kim
In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A queue that is newly created cannot be suspended because queue ids are
recycled after destruction so the debugger needs to know that this has
occurred.  Query functions will be later added that will clear a given
queue of its new queue status.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblocked when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).
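
For illustration, a user-space sketch of a suspend request using the
fields wired up below; the queue ids are placeholders and the per-queue
status bit masks live in the uapi header:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int suspend_two_queues(int kfd_fd, int target_pid)
{
	uint32_t queue_ids[2] = { 3, 7 };	/* hypothetical queue ids */
	struct kfd_ioctl_dbg_trap_args args = {0};

	args.op = KFD_IOC_DBG_TRAP_SUSPEND_QUEUES;
	args.pid = target_pid;
	args.suspend_queues.num_queues = 2;
	args.suspend_queues.grace_period = 0;	/* preempt immediately */
	args.suspend_queues.exception_mask = 0;
	args.suspend_queues.queue_array_ptr = (uint64_t)(uintptr_t)queue_ids;

	/* Return value is the number of queues actually suspended; each
	 * queue_ids[] entry now carries per-queue status in its upper bits. */
	return ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args);
}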

v2: add gfx11/mes support.
prevent header copy on suspend from overwriting user fields.
simplify resume_queues function.
address other nit-picks

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  11 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|   7 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 446 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  14 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  11 +-
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  18 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +-
 10 files changed, 518 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 60c3b0449d86..d50415fe0475 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -758,6 +758,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct 
amdgpu_device *adev)
return adev->have_atomics_support;
 }
 
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
+{
+   amdgpu_device_flush_hdp(adev, NULL);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, 
bool reset)
 {
amdgpu_umc_poison_handler(adev, reset);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index df782274a4c8..9d1c6ab14331 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -310,6 +310,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  uint64_t va, void *drm_priv,
  struct kgd_mem **mem, uint64_t *size,
  uint64_t *mmap_offset);
+void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
 int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 205a487d91d2..b62e93b35a44 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
pr_debug("Write ptr address   == 0x%016llX\n",
args->write_pointer_address);
 
+   kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, 
NULL, 0);
return 0;
 
 err_create_queue:
@@ -2908,7 +2909,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->launch_mode.launch_mode);
break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
+   r = suspend_queues(target,
+   args->suspend_queues.num_queues,
+   args->suspend_queues.grace_period,
+   args->suspend_queues.exception_mask,
+   (uint32_t 
*)args->suspend_queues.queue_array_ptr);
+
+   break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
+   r = resume_queues(target, args->resume_queues.num_queues,
+   (uint32_t 
*)args->resume_queues.queue_array_ptr);
+   break;
case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
  

[PATCH 18/32] drm/amdkfd: add send exception operation

2023-01-25 Thread Jonathan Kim
Add a debug operation that allows the debugger to send an exception
directly to runtime through a payload address.

For memory violations, normal vmfault signals will be applied to
notify runtime instead after passing in the saved exception data
when a memory violation was raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function which will be explained and implemented in a follow up
patch.

Signed-off-by: Jonathan Kim 
---
 .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 44 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  5 ++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  3 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  7 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 71 ++-
 8 files changed, 135 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 5c8023cba196..62a38cd820fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
return;
 
if (info.vmid == vmid)
-   kfd_signal_vm_fault_event(dev, pasid, );
+   kfd_signal_vm_fault_event(dev, pasid, , NULL);
else
-   kfd_signal_vm_fault_event(dev, pasid, NULL);
+   kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 628178126d3b..09fe8576dc8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2738,6 +2738,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
r = kfd_dbg_trap_disable(target);
break;
case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+   r = kfd_dbg_send_exception_to_runtime(target,
+   args->send_runtime_event.gpu_id,
+   args->send_runtime_event.queue_id,
+   args->send_runtime_event.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index fcd064b13f6a..4174b479ea6f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+   unsigned int dev_id,
+   unsigned int queue_id,
+   uint64_t error_reason)
+{
+   if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+   struct kfd_process_device *pdd = NULL;
+   struct kfd_hsa_memory_exception_data *data;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   if (p->pdds[i]->dev->id == dev_id) {
+   pdd = p->pdds[i];
+   break;
+   }
+   }
+
+   if (!pdd)
+   return -ENODEV;
+
+   data = (struct kfd_hsa_memory_exception_data *)
+   pdd->vm_fault_exc_data;
+
+   kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+   kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
+   error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+   }
+
+   if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+   /*
+* block should only happen after the debugger receives runtime
+* enable notice.
+*/
+   up(>runtime_enable_sema);
+   error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+   }
+
+   if (error_reason)
+   return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+   return 0;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
struct mqd_update_info minfo = {0};
@@ -175,6 +218,7 @@ static int kfd_dbg_set_workaround(struct kfd_process 
*target, bool enable)
}
 
return r;
+}
 
 static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debu

[PATCH 16/32] drm/amdkfd: add per process hw trap enable and disable functions

2023-01-25 Thread Jonathan Kim
To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the enabled devices.
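
The enable-or-rewind rule reads roughly like the model below (stand-in
stubs, not driver code; the real unwind is kfd_dbg_trap_deactivate()
with its unwind_count argument):

struct dbg_dev { int enabled; };

static int dbg_enable_one(struct dbg_dev *d)   { d->enabled = 1; return 0; } /* stub */
static void dbg_disable_one(struct dbg_dev *d) { d->enabled = 0; }           /* stub */

int dbg_enable_all(struct dbg_dev *devs, int n)
{
	int i, r;

	for (i = 0; i < n; i++) {
		r = dbg_enable_one(&devs[i]);
		if (r) {
			/* Unwind only the i devices already enabled. */
			while (i--)
				dbg_disable_one(&devs[i]);
			return r;
		}
	}
	return 0;
}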

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting.  Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.

Cooperative launch also has debugging restrictions based on HW/FW bugs.
If such bugs exist, the debugger cannot attach to a process that uses GWS
resources nor can GWS resources be requested if a process is being
debugged.

Multi-process debug devices can only enable trap temporaries based
on certain runtime scenarios, which will be explained when the
runtime enable functions are implemented in a follow up patch.

v2: add gfx11 support. fix fw checks. remove asic family name comments.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   5 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 148 +-
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  29 
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   9 ++
 5 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f5f639de28f0..628178126d3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1453,6 +1453,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
+   if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+   retval = -EBUSY;
+   goto out_unlock;
+   }
+
retval = pqm_set_gws(>pqm, args->queue_id, args->num_gws ? dev->gws 
: NULL);
mutex_unlock(>mutex);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 6e99a0160275..659dfc7411fe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -21,6 +21,7 @@
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include 
 
 void debug_event_write_work_handler(struct work_struct *work)
@@ -101,11 +102,68 @@ static int kfd_dbg_set_mes_debug_mode(struct 
kfd_process_device *pdd)
pdd->watch_points, flags);
 }
 
+/* kfd_dbg_trap_deactivate:
+ * target: target process
+ * unwind: If this is unwinding a failed kfd_dbg_trap_enable()
+ * unwind_count:
+ * If unwind == true, how far down the pdd list we need
+ * to unwind
+ * else: ignored
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, 
int unwind_count)
+{
+   int i, count = 0;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   /* If this is an unwind, and we have unwound the required
+* enable calls on the pdd list, we need to stop now
+* otherwise we may mess up another debugger session.
+*/
+   if (unwind && count == unwind_count)
+   break;
+
+   /* GFX off is already disabled by debug activate if not RLC 
restore supported. */
+   if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+   pdd->spi_dbg_override =
+   pdd->dev->kfd2kgd->disable_debug_trap(
+   pdd->dev->adev,
+   target->runtime_info.ttmp_setup,
+   pdd->dev->vm_info.last_vmid_kfd);
+   if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+   amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+   release_debug_trap_vmid(pdd->dev->dqm, 
>qpd))
+   pr_err("Failed to release debug vmid on [%i]\n", 
pdd->dev->id);
+
+   if (!pdd->dev->shared_resources.enable_mes)
+   debug_refresh_runlist(pdd->dev->dqm);
+   else
+   kfd_dbg_set_mes_debug_mode(pdd);
+
+   count++;
+   }
+
+   kfd_dbg_set_workaround(target, false);
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
if (!target->debug_trap_enabled)
return 0;
 
+   /*
+* Defer deactivation to runtime if runtim

[PATCH 26/32] drm/amdkfd: add debug set and clear address watch points operation

2023-01-25 Thread Jonathan Kim
Shader read, write and atomic memory operations can be alerted to the
debugger as an address watch exception.

Allow the debugger to pass in a watch point to a particular memory
address per device.

Note that there exist only 4 watch points per device to date, so have
the KFD keep track of which watch points are allocated or not.
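
A minimal model of that bookkeeping (illustrative only; the KFD's actual
tracking and its per-device spinlock live in kfd_debug.c):

#include <stdint.h>

#define NUM_WATCH_POINTS 4	/* per device, as noted above */

/* Return a free watch id and mark it used, or -1 if all four are busy. */
int alloc_watch_id(uint32_t *used_mask)
{
	int id;

	for (id = 0; id < NUM_WATCH_POINTS; id++) {
		if (!(*used_mask & (1u << id))) {
			*used_mask |= 1u << id;
			return id;
		}
	}
	return -1;
}

void free_watch_id(uint32_t *used_mask, int id)
{
	*used_mask &= ~(1u << id);
}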

v3: add gfx11 support.
cleanup gfx9 kgd calls to set and clear address watch.
use per device spinlock to set watch points.
fixup runlist refresh calls on set/clear address watch.

v2: change dev_id arg to gpu_id for consistency

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |  51 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  78 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|   8 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  52 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  77 ++
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  24 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 136 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   8 +-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   6 +-
 13 files changed, 451 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index 4de2066215b4..18baf1cd8c01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -118,6 +118,55 @@ static uint32_t kgd_aldebaran_set_wave_launch_mode(struct 
amdgpu_device *adev,
return data;
 }
 
+#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
+static uint32_t kgd_gfx_aldebaran_set_address_watch(
+   struct amdgpu_device *adev,
+   uint64_t watch_address,
+   uint32_t watch_address_mask,
+   uint32_t watch_id,
+   uint32_t watch_mode,
+   uint32_t debug_vmid)
+{
+   uint32_t watch_address_high;
+   uint32_t watch_address_low;
+   uint32_t watch_address_cntl;
+
+   watch_address_cntl = 0;
+   watch_address_low = lower_32_bits(watch_address);
+   watch_address_high = upper_32_bits(watch_address) & 0x;
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MODE,
+   watch_mode);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   MASK,
+   watch_address_mask >> 6);
+
+   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   TCP_WATCH0_CNTL,
+   VALID,
+   1);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_high);
+
+   WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
+   (watch_id * TCP_WATCH_STRIDE)),
+   watch_address_low);
+
+   return watch_address_cntl;
+}
+
+uint32_t kgd_gfx_aldebaran_clear_address_watch(struct amdgpu_device *adev,
+   uint32_t watch_id)
+{
+   return 0;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -140,6 +189,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.validate_trap_override_request = 
kgd_aldebaran_validate_trap_override_request,
.set_wave_launch_trap_override = 
kgd_aldebaran_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
+   .set_address_watch = kgd_gfx_aldebaran_set_address_watch,
+   .clear_address_watch = kgd_gfx_aldebaran_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 500013540356..a7fb5ef13166 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -413,6 +413,8 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.validate_trap_override_request = 
kgd_gfx_v9_validate_trap_overri

[PATCH 29/32] drm/amdkfd: add debug query exception info operation

2023-01-25 Thread Jonathan Kim
Allow the debugger to query additional info based on an exception code.
For device exceptions, it's currently only memory violation information.
For process exceptions, it's currently only runtime information.
Queue exceptions only report the queue exception status.

The debugger has the option of clearing the target exception on query.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 120 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |   6 ++
 3 files changed, 133 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0ae1237fa193..d3d2026b6e65 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2957,6 +2957,13 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
&args->query_debug_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
+   r = kfd_dbg_trap_query_exception_info(target,
+   args->query_exception_info.source_id,
+   args->query_exception_info.exception_code,
+   args->query_exception_info.clear_exception,
+   (void __user 
*)args->query_exception_info.info_ptr,
+   &args->query_exception_info.info_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 032207efef15..db316f0625f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -878,6 +878,126 @@ int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process 
*target,
return r;
 }
 
+int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
+   uint32_t source_id,
+   uint32_t exception_code,
+   bool clear_exception,
+   void __user *info,
+   uint32_t *info_size)
+{
+   bool found = false;
+   int r = 0;
+   uint32_t copy_size, actual_info_size = 0;
+   uint64_t *exception_status_ptr = NULL;
+
+   if (!target)
+   return -EINVAL;
+
+   if (!info || !info_size)
+   return -EINVAL;
+
+   mutex_lock(&target->event_mutex);
+
+   if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
+   /* Per queue exceptions */
+   struct queue *queue = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+   struct qcm_process_device *qpd = &pdd->qpd;
+
+   list_for_each_entry(queue, &qpd->queues_list, list) {
+   if (!found && queue->properties.queue_id == 
source_id) {
+   found = true;
+   break;
+   }
+   }
+   if (found)
+   break;
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(queue->properties.exception_status & 
KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+   exception_status_ptr = &queue->properties.exception_status;
+   } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
+   /* Per device exceptions */
+   struct kfd_process_device *pdd = NULL;
+   int i;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   pdd = target->pdds[i];
+   if (pdd->dev->id == source_id) {
+   found = true;
+   break;
+   }
+   }
+
+   if (!found) {
+   r = -EINVAL;
+   goto out;
+   }
+
+   if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
+   r = -ENODATA;
+   goto out;
+   }
+
+   if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
+   copy_size = min((size_t)(*info_size), 
pdd->vm_fault_exc_data_size);
+
+   if (copy_to_user(info, pdd->vm_fault_exc_data, 
copy_size)) {
+   r = -EFAULT;
+   goto out

[PATCH 30/32] drm/amdkfd: add debug queue snapshot operation

2023-01-25 Thread Jonathan Kim
Allow the debugger to get a snapshot of a specified number of queues
containing various queue property information that is copied to the
debugger.

Since the debugger doesn't know how many queues exist at any given time,
allow it to pass 0 as the requested number of snapshots to get the actual
number of potential snapshots, which can then be used to size a subsequent
snapshot request for the actual information.

To prevent future ABI breakage, pass in the requested entry_size.
The KFD will return its own entry_size in case the debugger still wants to
log the information in a core dump on sizing failure.

Also allow the debugger to clear exceptions when doing a snapshot.

v3: fix uninitialized return and change queue snapshot to type void for
proper increment on buffer copy.
use memset 0 to init snapshot entry to clear struct padding.

v2: change buf_size arg to num_queues for clarity.
fix minimum entry size calculation.
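
As a concrete illustration of the two-call pattern above (a sizing call
with num_queues == 0 followed by the real snapshot request), a hedged
userspace sketch.  AMDKFD_IOC_DBG_TRAP and struct kfd_ioctl_dbg_trap_args
are assumptions taken from the kfd_ioctl.h uapi header; only the
queue_snapshot fields and struct kfd_queue_snapshot_entry used by this
patch are relied on:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Size the buffer with num_queues == 0, then fetch the actual entries. */
static int snapshot_queues(int kfd_fd, int target_pid, uint64_t exception_clear_mask)
{
	struct kfd_ioctl_dbg_trap_args args;
	uint32_t n, entry_size;
	void *buf;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT;
	args.queue_snapshot.exception_mask = exception_clear_mask;
	args.queue_snapshot.entry_size = sizeof(struct kfd_queue_snapshot_entry);
	args.queue_snapshot.num_queues = 0;		/* sizing call only */
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1)
		return -1;

	n = args.queue_snapshot.num_queues;		/* actual number of queues */
	entry_size = args.queue_snapshot.entry_size;	/* KFD's own entry size */
	if (!n)
		return 0;

	buf = calloc(n, entry_size);
	if (!buf)
		return -1;

	args.queue_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)buf;
	args.queue_snapshot.num_queues = n;
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1) {
		free(buf);
		return -1;
	}

	/* buf now holds n snapshot entries of entry_size bytes each. */
	free(buf);
	return 0;
}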

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  6 +++
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 36 
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +++
 .../amd/amdkfd/kfd_process_queue_manager.c| 41 +++
 5 files changed, 91 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d3d2026b6e65..93b288233577 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2965,6 +2965,12 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
&args->query_exception_info.info_size);
break;
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
+   r = pqm_get_queue_snapshot(&p->pqm,
+   args->queue_snapshot.exception_mask,
+   (void __user 
*)args->queue_snapshot.snapshot_buf_ptr,
+   &args->queue_snapshot.num_queues,
+   &args->queue_snapshot.entry_size);
+   break;
case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT:
pr_warn("Debug op %i not supported yet\n", args->op);
r = -EACCES;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 7792fe9491c5..5ae504a512f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3000,6 +3000,42 @@ int suspend_queues(struct kfd_process *p,
return total_suspended;
 }
 
+static uint32_t set_queue_type_for_user(struct queue_properties *q_props)
+{
+   switch (q_props->type) {
+   case KFD_QUEUE_TYPE_COMPUTE:
+   return q_props->format == KFD_QUEUE_FORMAT_PM4
+   ? KFD_IOC_QUEUE_TYPE_COMPUTE
+   : KFD_IOC_QUEUE_TYPE_COMPUTE_AQL;
+   case KFD_QUEUE_TYPE_SDMA:
+   return KFD_IOC_QUEUE_TYPE_SDMA;
+   case KFD_QUEUE_TYPE_SDMA_XGMI:
+   return KFD_IOC_QUEUE_TYPE_SDMA_XGMI;
+   default:
+   WARN_ONCE(true, "queue type not recognized!");
+   return 0xffffffff;
+   };
+}
+
+void set_queue_snapshot_entry(struct queue *q,
+ uint64_t exception_clear_mask,
+ struct kfd_queue_snapshot_entry *qss_entry)
+{
+   qss_entry->ring_base_address = q->properties.queue_address;
+   qss_entry->write_pointer_address = (uint64_t)q->properties.write_ptr;
+   qss_entry->read_pointer_address = (uint64_t)q->properties.read_ptr;
+   qss_entry->ctx_save_restore_address =
+   q->properties.ctx_save_restore_area_address;
+   qss_entry->ctx_save_restore_area_size =
+   q->properties.ctx_save_restore_area_size;
+   qss_entry->exception_status = q->properties.exception_status;
+   qss_entry->queue_id = q->properties.queue_id;
+   qss_entry->gpu_id = q->device->id;
+   qss_entry->ring_size = (uint32_t)q->properties.queue_size;
+   qss_entry->queue_type = set_queue_type_for_user(&q->properties);
+   q->properties.exception_status &= ~exception_clear_mask;
+}
+
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 7ccf8d0d1867..89d4a5b293a5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -296,6 +296,9 @@ int suspend_queues(struct kfd_process *p,
 int resume_queues(struct kfd_process *p,
uint32_t num_queues,

[PATCH 05/32] drm/amdgpu: setup hw debug registers on driver initialization

2023-01-25 Thread Jonathan Kim
Add missing debug trap register references and initialize all debug
registers on boot by clearing the hardware exception overrides and the
wave allocation ID index.

The debugger requires that TTMPs 6 & 7 save the dispatch ID to map
waves onto dispatches during compute context inspection.
In order to correctly set this up, set the special reserved CP bit by
default whenever the MQD is initialized.

v2: leave TRAP_EN set for multi-process debugging as per-process disable
will be taken care of in later patches.
fixup typo in description.
enable ttmp setup for dispatch boundary in mqd init for gfx11.
add trap on wave start and end registers for gfx11.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 26 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 30 
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  5 ++
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  5 ++
 .../include/asic_reg/gc/gc_10_1_0_offset.h| 14 
 .../include/asic_reg/gc/gc_10_1_0_sh_mask.h   | 69 +++
 .../include/asic_reg/gc/gc_10_3_0_offset.h| 10 +++
 .../include/asic_reg/gc/gc_10_3_0_sh_mask.h   |  4 ++
 .../include/asic_reg/gc/gc_11_0_0_sh_mask.h   |  4 ++
 11 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 6983acc456b2..a5faf23805b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4823,6 +4823,29 @@ static u32 
gfx_v10_0_init_pa_sc_tile_steering_override(struct amdgpu_device *ade
 
 #define DEFAULT_SH_MEM_BASES   (0x6000)
 
+static void gfx_v10_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA0), 0);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_DATA1), 0);
+}
+
 static void gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
 {
int i;
@@ -4854,6 +4877,9 @@ static void gfx_v10_0_init_compute_vmid(struct 
amdgpu_device *adev)
WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
}
+
+   gfx_v10_0_debug_trap_config_init(adev, adev->vm_manager.first_kfd_vmid,
+   AMDGPU_NUM_VMID);
 }
 
 static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index c621b2ad7ba3..3ca7a31fb770 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1572,6 +1572,7 @@ static void gfx_v11_0_init_compute_vmid(struct 
amdgpu_device *adev)
/* Enable trap for each kfd vmid. */
data = RREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL);
data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   WREG32_SOC15(GC, 0, regSPI_GDBG_PER_VMID_CNTL, data);
}
soc21_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 8ad5c03506f2..222fe87161b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2289,6 +2289,29 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
adev->gfx.config.num_rbs = hweight32(active_rbs);
 }
 
+static void gfx_v9_0_debug_trap_config_init(struct amdgpu_device *adev,
+   uint32_t first_vmid,
+   uint32_t last_vmid)
+{
+   uint32_t data;
+   uint32_t trap_config_vmid_mask = 0;
+   int i;
+
+   /* Calculate trap config vmid mask */
+   for (i = first_vmid; i < last_vmid; i++)
+   trap_config_vmid_mask |= (1 << i);
+
+   data = REG_SET_FIELD(0, SPI_GDBG_TRAP_CONFIG,
+   VMID_SEL, trap_config_vmid_mask);
+   data = REG_SET_FIELD(data, SPI_GDBG_TRAP_CONFIG,
+   TRAP_EN, 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_CONFIG), data);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);
+

[PATCH 22/32] drm/amdkfd: add debug set exceptions enabled operation

2023-01-25 Thread Jonathan Kim
The debugger subscribes to notification for requested exceptions on attach.
Allow the debugger to change its subscription later on.

Signed-off-by: Jonathan Kim 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 36 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  2 ++
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 46f9d453dc5e..9b87ba351eff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2892,6 +2892,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->send_runtime_event.exception_mask);
break;
case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
+   kfd_dbg_set_enabled_debug_exception_mask(target,
+   args->set_exceptions_enabled.exception_mask);
+   break;
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 0c876172db4b..3ea53aaa776b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -529,3 +529,39 @@ int kfd_dbg_trap_enable(struct kfd_process *target, 
uint32_t fd,
 
return r;
 }
+
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask)
+{
+   uint64_t found_mask = 0;
+   struct process_queue_manager *pqm;
+   struct process_queue_node *pqn;
+   static const char write_data = '.';
+   loff_t pos = 0;
+   int i;
+
+   mutex_lock(&target->event_mutex);
+
+   found_mask |= target->exception_status;
+
+   pqm = &target->pqm;
+   list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+   if (!pqn->q)
+   continue;
+
+   found_mask |= pqn->q->properties.exception_status;
+   }
+
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   found_mask |= pdd->exception_status;
+   }
+
+   if (exception_set_mask & found_mask)
+   kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
+
+   target->exception_enable_mask = exception_set_mask;
+
+   mutex_unlock(&target->event_mutex);
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 43284243b2c4..81557579ab04 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -59,6 +59,8 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct 
kfd_dev *dev)
 
 void debug_event_write_work_handler(struct work_struct *work);
 
+void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
+   uint64_t exception_set_mask);
 /*
  * If GFX off is enabled, chips that do not support RLC restore for the debug
  * registers will disable GFX off temporarily for the entire debug session.
-- 
2.25.1



[PATCH 21/32] drm/amdkfd: update process interrupt handling for debug events

2023-01-25 Thread Jonathan Kim
The debugger must be notified of any exception it has subscribed to
that comes from hardware interrupts.

If a debugger session exits, any exceptions it subscribed to may still
have interrupts in the interrupt ring buffer or KGD/KFD pipeline.
To prevent a new session from inheriting stale interrupts, when a new
queue is created, open an interrupt drain and allow the IH ring to drain
from a timestamped checkpoint.  Then inject a custom IV so that once
the custom IV is picked up by the KFD, it's safe to close the drain
and proceed with queue creation.

The drain must also happen on debug disable as SW interrupts may still
be processed.  Drain at this time and clear all the exception status.

The debugger may also not be attached to, or subscribed to, certain
exceptions, so forward them directly to the runtime.

GFX10 also requires its own IV processing, hence the creation of
kfd_int_process_v10.c.  This is because the IVs from SQ interrupts are
packed into a new contiguous format, unlike GFX9.  To make this clear,
a separate interrupt handling code file was created.

v3: enable gfx11 interrupts
v2: fix interrupt drain on debug disable.
fix interrupt drain on queue create during -ERESTARTSYS.
fix up macros naming for ECODE parsing.
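
The drain handshake can be pictured with a small, self-contained toy
model: IVs already in the ring when the drain opens are flushed, and the
drain closes only once the injected marker IV is seen.  Everything here
(the ring array, the marker value, the drain_open flag) is an illustrative
stand-in for the per-process state the KFD keeps, not code from this
patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define DRAIN_MARKER 0xdeadbeefu	/* stand-in for the injected custom IV payload */

int main(void)
{
	/* Pretend IH ring: stale IVs, the injected marker, then new activity. */
	unsigned int ring[] = { 0x10, 0x11, 0x12, DRAIN_MARKER, 0x20 };
	bool drain_open = true;		/* opened when the new queue is created */
	size_t i;

	for (i = 0; i < sizeof(ring) / sizeof(ring[0]); i++) {
		if (drain_open) {
			if (ring[i] == DRAIN_MARKER) {
				drain_open = false;	/* safe to proceed with queue creation */
				printf("drain closed\n");
			} else {
				printf("flushed stale IV 0x%x\n", ring[i]);
			}
			continue;
		}
		printf("delivered IV 0x%x\n", ring[i]);
	}
	return 0;
}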

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  16 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
 drivers/gpu/drm/amd/amdkfd/Makefile   |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c|  85 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c  | 405 ++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  21 +-
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  98 -
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  12 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  47 ++
 .../amd/amdkfd/kfd_process_queue_manager.c|   4 +
 12 files changed, 681 insertions(+), 20 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8816853e50c0..60c3b0449d86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -763,6 +763,22 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct 
amdgpu_device *adev, bo
amdgpu_umc_poison_handler(adev, reset);
 }
 
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload)
+{
+   int ret;
+
+   /* Device or IH ring is not ready so bail. */
+   ret = amdgpu_ih_wait_on_checkpoint_process_ts(adev, &adev->irq.ih);
+   if (ret)
+   return ret;
+
+   /* Send payload to fence KFD interrupts */
+   amdgpu_amdkfd_interrupt(adev, payload);
+
+   return 0;
+}
+
 bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
 {
if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 333780491867..df782274a4c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -241,6 +241,8 @@ int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct 
amdgpu_device *dst,
struct amdgpu_device *src,
bool is_min);
 int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool 
is_min);
+int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
+   uint32_t *payload);
 
 /* Read user wptr from a specified user address space with page fault
  * disabled. The memory must be pinned and mapped to the hardware when
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 747754428073..2ec8f27c5366 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,6 +53,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
+   $(AMDKFD_PATH)/kfd_int_process_v10.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 16acf3d416eb..0c876172db4b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -125,6 +125,65 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
return is_subscribed;
 }
 
+/* set pending event queue entry from ring entry  */
+bool kfd_set_dbg_ev_from_interrupt

[PATCH 23/32] drm/amdkfd: add debug wave launch override operation

2023-01-25 Thread Jonathan Kim
This operation allows the debugger to override the enabled HW
exceptions on the device.

On debug devices that only support the debugging of a single process,
the HW exceptions are global and set through the SPI_GDBG_TRAP_MASK
register.
Because they are global, only address watch exceptions are allowed to
be enabled.  In other words, the debugger must preserve all non-address
watch exception states in normal mode operation by barring a full
replacement override or a non-address watch override request.

For multi-process debugging, all HW exception overrides are per-VMID so
all exceptions can be overridden or fully replaced.

In order for the debugger to know what is permissible, return the
supported override mask back to the debugger along with the previously
enabled overrides.

v3: v2 was reviewed, but requesting re-review for the added GFX11 support.

v2: switch unsupported override mode return from EPERM to EINVAL to
support unique EPERM on PTRACE failure.
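
The per-VMID merge performed by kgd_aldebaran_set_wave_launch_trap_override()
below reduces to a single mask expression; a small standalone check of that
expression (the bit values are arbitrary illustrations, not real
exception-enable bits):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bits named in 'request' take their value from 'bits'; all others keep 'prev'. */
static uint32_t merge_excp_en(uint32_t prev, uint32_t bits, uint32_t request)
{
	return (bits & request) | (prev & ~request);
}

int main(void)
{
	uint32_t prev = 0x0c;	/* previously enabled exceptions (illustrative) */
	uint32_t bits = 0x03;	/* requested values for the bits being changed */
	uint32_t req  = 0x05;	/* which bits the debugger asked to change */
	uint32_t next = merge_excp_en(prev, bits, req);

	/* bit0 turned on, bit2 turned off, bit3 left untouched */
	assert(next == 0x09);
	printf("EXCP_EN: 0x%02x -> 0x%02x\n", prev, next);
	return 0;
}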

Signed-off-by: Jonathan Kim 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 47 ++
 .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  2 +
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 55 
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h| 10 +++
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  5 +-
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c| 86 ++-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 55 
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 10 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  7 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c| 69 +++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h|  6 ++
 11 files changed, 350 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index a64a53f9efe6..84a9d9391ea4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -25,6 +25,7 @@
 #include "amdgpu_amdkfd_gfx_v9.h"
 #include "gc/gc_9_4_2_offset.h"
 #include "gc/gc_9_4_2_sh_mask.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 /**
  * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
@@ -62,6 +63,50 @@ static uint32_t kgd_aldebaran_disable_debug_trap(struct 
amdgpu_device *adev,
return data;
 }
 
+static int kgd_aldebaran_validate_trap_override_request(struct amdgpu_device 
*adev,
+   uint32_t trap_override,
+   uint32_t 
*trap_mask_supported)
+{
+   *trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
+   KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
+   KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_FP_OVERFLOW |
+   KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
+   KFD_DBG_TRAP_MASK_FP_INEXACT |
+   KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
+   KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
+   KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;
+
+   if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
+   trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
+   return -EPERM;
+
+   return 0;
+}
+
+/* returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
+static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct 
amdgpu_device *adev,
+   uint32_t vmid,
+   uint32_t trap_override,
+   uint32_t trap_mask_bits,
+   uint32_t trap_mask_request,
+   uint32_t *trap_mask_prev,
+   uint32_t kfd_dbg_trap_cntl_prev)
+
+{
+   uint32_t data = 0;
+
+   *trap_mask_prev = REG_GET_FIELD(kfd_dbg_trap_cntl_prev, 
SPI_GDBG_PER_VMID_CNTL, EXCP_EN);
+   trap_mask_bits = (trap_mask_bits & trap_mask_request) |
+   (*trap_mask_prev & ~trap_mask_request);
+
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 
trap_mask_bits);
+   data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 
trap_override);
+
+   return data;
+}
+
 const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
@@ -81,6 +126,8 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.set_vm_context_page_table_base = 
kgd_gfx_v9_set_vm_context_page_table_base,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
+   .validate_trap_override_reque

[PATCH 27/32] drm/amdkfd: add debug set flags operation

2023-01-25 Thread Jonathan Kim
Allow the debugger to set single memory and single ALU operations.

Some exceptions are imprecise (memory violations, address watch) in the
sense that a trap occurs only when the exception interrupt occurs and
not at the non-halting faulty instruction.  Trap temporaries 0 & 1 save
the program counter address, which means that these values will not point
to the faulty instruction address but to wherever the wave happened to be
when the interrupt was raised.

Setting the Single Memory Operations flag will inject an automatic wait
on every memory operation instruction forcing imprecise memory exceptions
to become precise at the cost of performance.  This setting is not
permitted on debug devices that support only a global setting of this
option.

Return the previously set flags to the debugger as well.

v3: make precise mem op the only available flag for now.

v2: add gfx11 support.
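
A hedged userspace sketch of turning the precise-memory-operations flag
on.  AMDKFD_IOC_DBG_TRAP and struct kfd_ioctl_dbg_trap_args are again
assumptions taken from the kfd_ioctl.h uapi header, while
KFD_IOC_DBG_TRAP_SET_FLAGS, KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP and
set_flags.flags come from this patch; the same field returns the previous
flags, mirroring kfd_dbg_trap_set_flags() below:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Request precise memory operations for the target process. */
static int set_precise_mem_ops(int kfd_fd, int target_pid)
{
	struct kfd_ioctl_dbg_trap_args args;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_SET_FLAGS;
	args.set_flags.flags = KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;

	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1)
		return -1;	/* fails with EACCES if only a global setting exists */

	printf("previous debug flags: 0x%x\n", args.set_flags.flags);
	return 0;
}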

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_debug.c   | 38 
 drivers/gpu/drm/amd/amdkfd/kfd_debug.h   |  1 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 8f2ede781863..c34caa14b84e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2947,6 +2947,8 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, 
struct kfd_process *p, v
args->clear_node_address_watch.id);
break;
case KFD_IOC_DBG_TRAP_SET_FLAGS:
+   r = kfd_dbg_trap_set_flags(target, &args->set_flags.flags);
+   break;
case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT:
case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO:
case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 8d2e1adb442d..77ba7da2bb9d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -23,6 +23,7 @@
 #include "kfd_debug.h"
 #include "kfd_device_queue_manager.h"
 #include 
+#include 
 
 #define MAX_WATCH_ADDRESSES4
 
@@ -425,6 +426,40 @@ static void kfd_dbg_clear_process_address_watch(struct 
kfd_process *target)
kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], 
j);
 }
 
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
+{
+   uint32_t prev_flags = target->dbg_flags;
+   int i, r = 0;
+
+   for (i = 0; i < target->n_pdds; i++) {
+   if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
+   (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
+   *flags = prev_flags;
+   return -EACCES;
+   }
+   }
+
+   target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
+   *flags = prev_flags;
+   for (i = 0; i < target->n_pdds; i++) {
+   struct kfd_process_device *pdd = target->pdds[i];
+
+   if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+   continue;
+
+   if (!pdd->dev->shared_resources.enable_mes)
+   r = debug_refresh_runlist(pdd->dev->dqm);
+   else
+   r = kfd_dbg_set_mes_debug_mode(pdd);
+
+   if (r) {
+   target->dbg_flags = prev_flags;
+   break;
+   }
+   }
+
+   return r;
+}
 
 /* kfd_dbg_trap_deactivate:
  * target: target process
@@ -439,9 +474,12 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, 
bool unwind, int unwind
int i, count = 0;
 
if (!unwind) {
+   uint32_t flags = 0;
cancel_work_sync(&target->debug_event_workarea);
kfd_dbg_clear_process_address_watch(target);
kfd_dbg_trap_set_wave_launch_mode(target, 0);
+
+   kfd_dbg_trap_set_flags(target, );
}
 
for (i = 0; i < target->n_pdds; i++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
index 63c716ce5ab9..782362d82890 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
@@ -57,6 +57,7 @@ int kfd_dbg_trap_set_dev_address_watch(struct 
kfd_process_device *pdd,
uint32_t watch_address_mask,
uint32_t *watch_id,
uint32_t watch_mode);
+int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags);
 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
unsigned int dev_id,
unsigned int queue_id,
-- 
2.25.1


