---
drivers/accel/amdxdna/aie2_ctx.c | 85 ++++++++++++++++++++++++---
drivers/accel/amdxdna/aie2_message.c | 41 +++++++++++++
drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
drivers/accel/amdxdna/aie2_pci.c | 14 +++++
drivers/accel/amdxdna/aie2_pci.h | 5 ++
drivers/accel/amdxdna/amdxdna_ctx.c | 6 +-
drivers/accel/amdxdna/amdxdna_ctx.h | 18 +++++-
drivers/accel/amdxdna/npu4_regs.c | 3 +-
8 files changed, 213 insertions(+), 11 deletions(-)
diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 779ac70d62d7..6292349868c5 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
#define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */
+struct aie2_ctx_health {
+ struct amdxdna_ctx_health header;
+ u32 txn_op_idx;
+ u32 ctx_pc;
+ u32 fatal_error_type;
+ u32 fatal_error_exception_type;
+ u32 fatal_error_exception_pc;
+ u32 fatal_error_app_module;
+};
+
static void aie2_job_release(struct kref *ref)
{
struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
wake_up(&job->hwctx->priv->job_free_wq);
if (job->out_fence)
dma_fence_put(job->out_fence);
+ kfree(job->aie2_job_health);
kfree(job);
}
@@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
aie2_job_put(job);
}
+static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
+{
+ struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
+ struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+ struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+ struct app_health_report *report = job->aie2_job_health;
+ u32 fail_cmd_idx = 0;
+
+ if (!report)
+ goto set_timeout;
+
+ XDNA_ERR(xdna, "Firmware timeout state capture:");
+ XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+ XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+ XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
+ XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+ XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+ XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+	XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
+	XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
+	XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
+	XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
+	XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
+	XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
+
+ fail_cmd_idx = report->run_list_id;
+ aie2_health = kzalloc_obj(*aie2_health);
+ if (!aie2_health)
+ goto set_timeout;
+
+ aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+ aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+ aie2_health->txn_op_idx = report->txn_op_id;
+ aie2_health->ctx_pc = report->ctx_pc;
+ aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+ aie2_health->fatal_error_exception_type =
report->fatal_info.exception_type;
+ aie2_health->fatal_error_exception_pc =
report->fatal_info.exception_pc;
+ aie2_health->fatal_error_app_module =
report->fatal_info.app_module;
+
+set_timeout:
+	amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+			      aie2_health, sizeof(*aie2_health));
+}
+
static int
aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
{
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+ aie2_set_cmd_timeout(job);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32))) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
ret = -EINVAL;
goto out;
}
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
if (status == AIE2_STATUS_SUCCESS)
amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
else
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
out:
aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
struct amdxdna_sched_job *job = handle;
struct amdxdna_gem_obj *cmd_abo;
struct amdxdna_dev *xdna;
+ u32 fail_cmd_idx = 0;
u32 fail_cmd_status;
- u32 fail_cmd_idx;
u32 cmd_status;
int ret = 0;
cmd_abo = job->cmd_bo;
if (unlikely(job->job_timeout)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+ aie2_set_cmd_timeout(job);
ret = -EINVAL;
goto out;
}
if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
- amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
ret = -EINVAL;
goto out;
}
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
fail_cmd_idx, fail_cmd_status);
if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
 		ret = -EINVAL;
 	} else {
-		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
+		amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
}
out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
{
struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
struct amdxdna_hwctx *hwctx = job->hwctx;
+ struct app_health_report *report;
struct amdxdna_dev *xdna;
+ int ret;
xdna = hwctx->client->xdna;
trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
job->job_timeout = true;
+
mutex_lock(&xdna->dev_lock);
+ report = kzalloc_obj(*report);
+ if (!report)
+ goto reset_hwctx;
+
+	ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+ if (ret)
+ kfree(report);
+ else
+ job->aie2_job_health = report;
+
+reset_hwctx:
aie2_hwctx_stop(xdna, hwctx, sched_job);
aie2_hwctx_restart(xdna, hwctx);
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 798128b6b7b7..4ec591306854 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
}
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+ struct app_health_report *report)
+{
+ DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+ struct amdxdna_dev *xdna = ndev->xdna;
+ struct app_health_report *buf;
+ dma_addr_t dma_addr;
+ u32 buf_size;
+ int ret;
+
+ if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
+ XDNA_DBG(xdna, "App health feature not supported");
+ return -EOPNOTSUPP;
+ }
+
+ buf_size = sizeof(*report);
+ buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
+ if (IS_ERR(buf)) {
+ XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+ return PTR_ERR(buf);
+ }
+
+ req.buf_addr = dma_addr;
+ req.context_id = context_id;
+ req.buf_size = buf_size;
+
+ drm_clflush_virt_range(buf, sizeof(*report));
+ ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+ if (ret) {
+		XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
+ goto free_buf;
+ }
+
+ /* Copy the report to caller's buffer */
+ memcpy(report, buf, sizeof(*report));
+
+free_buf:
+ aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+ return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index 728ef56f7f0a..f18e89a39e35 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
MSG_OP_SET_RUNTIME_CONFIG = 0x10A,
MSG_OP_GET_RUNTIME_CONFIG = 0x10B,
MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C,
+ MSG_OP_GET_APP_HEALTH = 0x114,
MSG_OP_MAX_DRV_OPCODE,
MSG_OP_GET_PROTOCOL_VERSION = 0x301,
MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
struct config_debug_bo_resp {
enum aie2_msg_status status;
} __packed;
+
+struct fatal_error_info {
+ __u32 fatal_type; /* Fatal error type */
+	__u32 exception_type; /* Only valid if fatal_type is a specific value */
+	__u32 exception_argument; /* Argument based on exception type */
+	__u32 exception_pc; /* Program Counter at the time of the exception */
+	__u32 app_module; /* Error module name */
+	__u32 task_index; /* Index of the task in which the error occurred */
+ __u32 reserved[128];
+};
+
+struct app_health_report {
+ __u16 major;
+ __u16 minor;
+ __u32 size;
+ __u32 context_id;
+ /*
+	 * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+	 * application. Before execution begins or after successful completion, the value is set
+	 * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+	 * opcode's PC value.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+	 */
+	__u32 dpu_pc;
+	/*
+	 * Index of the last initiated TXN opcode.
+	 * Before execution starts or after successful completion, the value is set to UINT_MAX.
+	 * If execution halts prematurely due to an error, this field retains the opcode's ID.
+	 * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+	 * Proper interpretation requires familiarity with the implementation details.
+ */
+ __u32 txn_op_id;
+ /* The PC of the context at the time of the report */
+ __u32 ctx_pc;
+ struct fatal_error_info fatal_info;
+ /* Index of the most recently executed run list entry. */
+ __u32 run_list_id;
+};
+
+struct get_app_health_req {
+ __u32 context_id;
+ __u32 buf_size;
+ __u64 buf_addr;
+} __packed;
+
+struct get_app_health_resp {
+ enum aie2_msg_status status;
+ __u32 required_buffer_size;
+ __u32 reserved[7];
+} __packed;
#endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ddd3d82f3426..9e39bfe75971 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
struct amdxdna_drm_get_array *array_args = arg;
struct amdxdna_drm_hwctx_entry __user *buf;
+ struct app_health_report report;
+ struct amdxdna_dev_hdl *ndev;
u32 size;
+ int ret;
if (!array_args->num_element)
return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
tmp->latency = hwctx->qos.latency;
tmp->frame_exec_time = hwctx->qos.frame_exec_time;
tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+ ndev = hwctx->client->xdna->dev_handle;
+ ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+ if (!ret) {
+ /* Fill in app health report fields */
+ tmp->txn_op_idx = report.txn_op_id;
+ tmp->ctx_pc = report.ctx_pc;
+ tmp->fatal_error_type = report.fatal_info.fatal_type;
+		tmp->fatal_error_exception_type = report.fatal_info.exception_type;
+ tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+ tmp->fatal_error_app_module = report.fatal_info.app_module;
+ }
buf = u64_to_user_ptr(array_args->buffer);
size = min(sizeof(*tmp), array_args->element_size);
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 885ae7e6bfc7..efcf4be035f0 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -10,6 +10,7 @@
#include <linux/limits.h>
#include <linux/semaphore.h>
+#include "aie2_msg_priv.h"
#include "amdxdna_mailbox.h"
#define AIE2_INTERVAL 20000 /* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
AIE2_NPU_COMMAND,
AIE2_PREEMPT,
AIE2_TEMPORAL_ONLY,
+ AIE2_APP_HEALTH,
AIE2_FEATURE_MAX
};
@@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
u32 min_minor;
};
+#define AIE2_ALL_FEATURES GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
 #define AIE2_FEATURE_ON(ndev, feature) test_bit(feature, &(ndev)->feature_mask)
struct amdxdna_dev_priv {
@@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
 int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
 int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev, struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+			  struct app_health_report *report);
 int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index 666dfd7b2a80..4b921715176d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
struct amdxdna_sched_job *job, u32 cmd_idx,
- enum ert_cmd_state error_state)
+ enum ert_cmd_state error_state,
+ void *err_data, size_t size)
{
struct amdxdna_client *client = job->hwctx->client;
struct amdxdna_cmd *cmd = abo->mem.kva;
@@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
}
memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+ if (err_data)
+		memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
+
if (cc)
amdxdna_gem_put_obj(abo);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index fbdf9d000871..57db1527a93b 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
u32 prop_args[]; /* properties and regular kernel arguments */
};
+#define AMDXDNA_CMD_CTX_HEALTH_V1 1
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0
+struct amdxdna_ctx_health {
+ u32 version;
+ u32 npu_gen;
+};
+
/* Exec buffer command header format */
#define AMDXDNA_CMD_STATE GENMASK(3, 0)
#define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10)
@@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
u32 result;
};
+struct app_health_report;
+union amdxdna_job_priv {
+ struct app_health_report *aie2_health;
+};
+
struct amdxdna_sched_job {
struct drm_sched_job base;
struct kref refcnt;
@@ -136,10 +148,13 @@ struct amdxdna_sched_job {
u64 seq;
struct amdxdna_drv_cmd *drv_cmd;
struct amdxdna_gem_obj *cmd_bo;
+ union amdxdna_job_priv priv;
size_t bo_cnt;
struct drm_gem_object *bos[] __counted_by(bo_cnt);
};
+#define aie2_job_health priv.aie2_health
+
static inline u32
amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
{
@@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
struct amdxdna_sched_job *job, u32 cmd_idx,
- enum ert_cmd_state error_state);
+ enum ert_cmd_state error_state,
+ void *err_data, size_t size);
void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index ce25eef5fc34..619bff042e52 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
 	{ .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
 	{ .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
 	{ .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
-	{ .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
+	{ .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+	{ .features = AIE2_ALL_FEATURES, .major = 7 },
{ 0 }
};