Applied to drm-misc-next.

On 3/17/26 12:25, Mario Limonciello wrote:


On 3/16/26 23:49, Lizhi Hou wrote:
The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.

When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.

Signed-off-by: Lizhi Hou <[email protected]>
Reviewed-by: Mario Limonciello <[email protected]>
---
  drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
  drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
  drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
  drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
  drivers/accel/amdxdna/aie2_pci.h      |  5 ++
  drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
  drivers/accel/amdxdna/amdxdna_ctx.h   | 18 +++++-
  drivers/accel/amdxdna/npu4_regs.c     |  3 +-
  8 files changed, 213 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 779ac70d62d7..6292349868c5 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)");
    #define HWCTX_MAX_TIMEOUT    60000 /* milliseconds */
  +struct aie2_ctx_health {
+    struct amdxdna_ctx_health header;
+    u32 txn_op_idx;
+    u32 ctx_pc;
+    u32 fatal_error_type;
+    u32 fatal_error_exception_type;
+    u32 fatal_error_exception_pc;
+    u32 fatal_error_app_module;
+};
+
  static void aie2_job_release(struct kref *ref)
  {
      struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
      wake_up(&job->hwctx->priv->job_free_wq);
      if (job->out_fence)
          dma_fence_put(job->out_fence);
+    kfree(job->aie2_job_health);
      kfree(job);
  }
  @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
      aie2_job_put(job);
  }
  +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job)
+{
+    struct aie2_ctx_health *aie2_health __free(kfree) = NULL;
+    struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+    struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+    struct app_health_report *report = job->aie2_job_health;
+    u32 fail_cmd_idx = 0;
+
+    if (!report)
+        goto set_timeout;
+
+    XDNA_ERR(xdna, "Firmware timeout state capture:");
+    XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+    XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+    XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
+    XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+    XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+    XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+    XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type);
+    XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type);
+    XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc);
+    XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module);
+    XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index);
+    XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
+
+    fail_cmd_idx = report->run_list_id;
+    aie2_health = kzalloc_obj(*aie2_health);
+    if (!aie2_health)
+        goto set_timeout;
+
+    aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+    aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+    aie2_health->txn_op_idx = report->txn_op_id;
+    aie2_health->ctx_pc = report->ctx_pc;
+    aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+    aie2_health->fatal_error_exception_type = report->fatal_info.exception_type;
+    aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
+    aie2_health->fatal_error_app_module = report->fatal_info.app_module;
+
+set_timeout:
+    amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+                  aie2_health, sizeof(*aie2_health));
+}
+
  static int
  aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
  {
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
      cmd_abo = job->cmd_bo;
        if (unlikely(job->job_timeout)) {
-        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+        aie2_set_cmd_timeout(job);
          ret = -EINVAL;
          goto out;
      }
        if (unlikely(!data) || unlikely(size != sizeof(u32))) {
-        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
          ret = -EINVAL;
          goto out;
      }
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
      if (status == AIE2_STATUS_SUCCESS)
          amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
      else
-        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0);
    out:
      aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
      struct amdxdna_sched_job *job = handle;
      struct amdxdna_gem_obj *cmd_abo;
      struct amdxdna_dev *xdna;
+    u32 fail_cmd_idx = 0;
      u32 fail_cmd_status;
-    u32 fail_cmd_idx;
      u32 cmd_status;
      int ret = 0;
        cmd_abo = job->cmd_bo;
        if (unlikely(job->job_timeout)) {
-        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+        aie2_set_cmd_timeout(job);
          ret = -EINVAL;
          goto out;
      }
        if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
-        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+        amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0);
          ret = -EINVAL;
          goto out;
      }
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size)
           fail_cmd_idx, fail_cmd_status);
        if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
-        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT);
+        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0);
          ret = -EINVAL;
      } else {
-        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR);
+        amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0);
      }
    out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
  {
      struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
      struct amdxdna_hwctx *hwctx = job->hwctx;
+    struct app_health_report *report;
      struct amdxdna_dev *xdna;
+    int ret;
        xdna = hwctx->client->xdna;
      trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
      job->job_timeout = true;
+
      mutex_lock(&xdna->dev_lock);
+    report = kzalloc_obj(*report);
+    if (!report)
+        goto reset_hwctx;
+
+    ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+    if (ret)
+        kfree(report);
+    else
+        job->aie2_job_health = report;
+
+reset_hwctx:
      aie2_hwctx_stop(xdna, hwctx, sched_job);
        aie2_hwctx_restart(xdna, hwctx);
diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c
index 798128b6b7b7..4ec591306854 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -1185,3 +1185,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *
        return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
  }
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+              struct app_health_report *report)
+{
+    DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+    struct amdxdna_dev *xdna = ndev->xdna;
+    struct app_health_report *buf;
+    dma_addr_t dma_addr;
+    u32 buf_size;
+    int ret;
+
+    if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) {
+        XDNA_DBG(xdna, "App health feature not supported");
+        return -EOPNOTSUPP;
+    }
+
+    buf_size = sizeof(*report);
+    buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr);
+    if (IS_ERR(buf)) {
+        XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+        return PTR_ERR(buf);
+    }
+
+    req.buf_addr = dma_addr;
+    req.context_id = context_id;
+    req.buf_size = buf_size;
+
+    drm_clflush_virt_range(buf, sizeof(*report));
+    ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+    if (ret) {
+        XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status);
+        goto free_buf;
+    }
+
+    /* Copy the report to caller's buffer */
+    memcpy(report, buf, sizeof(*report));
+
+free_buf:
+    aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+    return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h
index 728ef56f7f0a..f18e89a39e35 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
      MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
      MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
      MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
+    MSG_OP_GET_APP_HEALTH              = 0x114,
      MSG_OP_MAX_DRV_OPCODE,
      MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
      MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
  struct config_debug_bo_resp {
      enum aie2_msg_status    status;
  } __packed;
+
+struct fatal_error_info {
+    __u32 fatal_type;         /* Fatal error type */
+    __u32 exception_type;     /* Only valid if fatal_type is a specific value */
+    __u32 exception_argument; /* Argument based on exception type */
+    __u32 exception_pc;       /* Program Counter at the time of the exception */
+    __u32 app_module;         /* Error module name */
+    __u32 task_index;         /* Index of the task in which the error occurred */
+    __u32 reserved[128];
+};
+
+struct app_health_report {
+    __u16 major;
+    __u16 minor;
+    __u32 size;
+    __u32 context_id;
+    /*
+     * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+     * application. Before execution begins or after successful completion, the value is set
+     * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+     * opcode's PC value.
+     * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+     * Proper interpretation requires familiarity with the implementation details.
+     */
+    __u32 dpu_pc;
+    /*
+     * Index of the last initiated TXN opcode.
+     * Before execution starts or after successful completion, the value is set to UINT_MAX.
+     * If execution halts prematurely due to an error, this field retains the opcode's ID.
+     * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+     * Proper interpretation requires familiarity with the implementation details.
+     */
+    __u32 txn_op_id;
+    /* The PC of the context at the time of the report */
+    __u32 ctx_pc;
+    struct fatal_error_info        fatal_info;
+    /* Index of the most recently executed run list entry. */
+    __u32 run_list_id;
+};
+
+struct get_app_health_req {
+    __u32 context_id;
+    __u32 buf_size;
+    __u64 buf_addr;
+} __packed;
+
+struct get_app_health_resp {
+    enum aie2_msg_status status;
+    __u32 required_buffer_size;
+    __u32 reserved[7];
+} __packed;
  #endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ddd3d82f3426..9e39bfe75971 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
      struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
      struct amdxdna_drm_get_array *array_args = arg;
      struct amdxdna_drm_hwctx_entry __user *buf;
+    struct app_health_report report;
+    struct amdxdna_dev_hdl *ndev;
      u32 size;
+    int ret;
        if (!array_args->num_element)
          return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg)
      tmp->latency = hwctx->qos.latency;
      tmp->frame_exec_time = hwctx->qos.frame_exec_time;
      tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+    ndev = hwctx->client->xdna->dev_handle;
+    ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+    if (!ret) {
+        /* Fill in app health report fields */
+        tmp->txn_op_idx = report.txn_op_id;
+        tmp->ctx_pc = report.ctx_pc;
+        tmp->fatal_error_type = report.fatal_info.fatal_type;
+        tmp->fatal_error_exception_type = report.fatal_info.exception_type;
+        tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+        tmp->fatal_error_app_module = report.fatal_info.app_module;
+    }
        buf = u64_to_user_ptr(array_args->buffer);
      size = min(sizeof(*tmp), array_args->element_size);
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 885ae7e6bfc7..efcf4be035f0 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -10,6 +10,7 @@
  #include <linux/limits.h>
  #include <linux/semaphore.h>
  +#include "aie2_msg_priv.h"
  #include "amdxdna_mailbox.h"
    #define AIE2_INTERVAL    20000    /* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
      AIE2_NPU_COMMAND,
      AIE2_PREEMPT,
      AIE2_TEMPORAL_ONLY,
+    AIE2_APP_HEALTH,
      AIE2_FEATURE_MAX
  };
  @@ -271,6 +273,7 @@ struct aie2_fw_feature_tbl {
      u32 min_minor;
  };
+#define AIE2_ALL_FEATURES    GENMASK_ULL(AIE2_FEATURE_MAX - 1, AIE2_NPU_COMMAND)
   #define AIE2_FEATURE_ON(ndev, feature)    test_bit(feature, &(ndev)->feature_mask)
    struct amdxdna_dev_priv {
@@ -341,6 +344,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver
   int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata);
  int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
                  struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+              struct app_health_report *report);
  int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
  int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
  int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c
index 666dfd7b2a80..4b921715176d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
    int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
                struct amdxdna_sched_job *job, u32 cmd_idx,
-              enum ert_cmd_state error_state)
+              enum ert_cmd_state error_state,
+              void *err_data, size_t size)
  {
      struct amdxdna_client *client = job->hwctx->client;
      struct amdxdna_cmd *cmd = abo->mem.kva;
@@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
      }
        memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+    if (err_data)
+        memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd)));
+
      if (cc)
          amdxdna_gem_put_obj(abo);
  diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h
index fbdf9d000871..57db1527a93b 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
      u32 prop_args[];    /* properties and regular kernel arguments */
  };
  +#define AMDXDNA_CMD_CTX_HEALTH_V1    1
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2    0
+struct amdxdna_ctx_health {
+    u32 version;
+    u32 npu_gen;
+};
+
  /* Exec buffer command header format */
  #define AMDXDNA_CMD_STATE        GENMASK(3, 0)
  #define AMDXDNA_CMD_EXTRA_CU_MASK    GENMASK(11, 10)
@@ -122,6 +129,11 @@ struct amdxdna_drv_cmd {
      u32            result;
  };
  +struct app_health_report;
+union amdxdna_job_priv {
+    struct app_health_report *aie2_health;
+};
+
  struct amdxdna_sched_job {
      struct drm_sched_job    base;
      struct kref        refcnt;
@@ -136,10 +148,13 @@ struct amdxdna_sched_job {
      u64            seq;
      struct amdxdna_drv_cmd    *drv_cmd;
      struct amdxdna_gem_obj    *cmd_bo;
+    union amdxdna_job_priv    priv;
      size_t            bo_cnt;
      struct drm_gem_object    *bos[] __counted_by(bo_cnt);
  };
  +#define aie2_job_health priv.aie2_health
+
  static inline u32
  amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo)
  {
@@ -169,7 +184,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size);
  u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
  int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
                struct amdxdna_sched_job *job, u32 cmd_idx,
-              enum ert_cmd_state error_state);
+              enum ert_cmd_state error_state,
+              void *err_data, size_t size);
    void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
  void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c
index ce25eef5fc34..619bff042e52 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
       { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
       { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
       { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 },
-    { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 },
+    { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+    { .features = AIE2_ALL_FEATURES, .major = 7 },
      { 0 }
  };

Reply via email to