Re: [PATCH V6 07/10] accel/amdxdna: Add command execution

2024-11-06 Thread Lizhi Hou



On 11/3/24 22:31, Matthew Brost wrote:

On Wed, Oct 30, 2024 at 08:51:44AM -0700, Lizhi Hou wrote:

Add interfaces for user application to submit command and wait for its
completion.

Co-developed-by: Min Ma 
Signed-off-by: Min Ma 
Signed-off-by: Lizhi Hou 
---
  drivers/accel/amdxdna/aie2_ctx.c  | 664 +-
  drivers/accel/amdxdna/aie2_message.c  | 343 +
  drivers/accel/amdxdna/aie2_pci.c  |   5 +
  drivers/accel/amdxdna/aie2_pci.h  |  35 +
  drivers/accel/amdxdna/aie2_psp.c  |   2 +
  drivers/accel/amdxdna/aie2_smu.c  |   2 +
  drivers/accel/amdxdna/amdxdna_ctx.c   | 330 -
  drivers/accel/amdxdna/amdxdna_ctx.h   | 111 +++
  drivers/accel/amdxdna/amdxdna_gem.c   |  10 +
  drivers/accel/amdxdna/amdxdna_gem.h   |   1 +
  .../accel/amdxdna/amdxdna_mailbox_helper.c|   5 +
  drivers/accel/amdxdna/amdxdna_pci_drv.c   |   5 +
  drivers/accel/amdxdna/amdxdna_pci_drv.h   |   4 +
  drivers/accel/amdxdna/amdxdna_sysfs.c |   5 +
  drivers/accel/amdxdna/npu1_regs.c |   1 +
  drivers/accel/amdxdna/npu2_regs.c |   1 +
  drivers/accel/amdxdna/npu4_regs.c |   1 +
  drivers/accel/amdxdna/npu5_regs.c |   1 +
  include/trace/events/amdxdna.h|  41 ++
  include/uapi/drm/amdxdna_accel.h  |  38 +
  20 files changed, 1596 insertions(+), 9 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 617fc05077d9..c3ac668e16ab 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -8,8 +8,12 @@
  #include 
  #include 
  #include 
+#include 
+#include 
  #include 
+#include 
  
+#include "aie2_msg_priv.h"

  #include "aie2_pci.h"
  #include "aie2_solver.h"
  #include "amdxdna_ctx.h"
@@ -17,6 +21,337 @@
  #include "amdxdna_mailbox.h"
  #include "amdxdna_pci_drv.h"
  
+bool force_cmdlist;

+module_param(force_cmdlist, bool, 0600);
+MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
+
+#define HWCTX_MAX_TIMEOUT  6 /* milliseconds */
+
+static struct amdxdna_sched_job *
+aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
+{
+   int idx;
+
+   /* Special sequence number for oldest fence if exist */
+   if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
+   idx = get_job_idx(hwctx->priv->seq);
+   goto out;
+   }
+
+   if (seq >= hwctx->priv->seq)
+   return ERR_PTR(-EINVAL);
+
+   if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
+   return NULL;
+
+   idx = get_job_idx(seq);
+
+out:
+   return hwctx->priv->pending[idx];
+}
+
+/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
+static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
+   struct drm_sched_job *bad_job)
+{
+   drm_sched_stop(&hwctx->priv->sched, bad_job);
+   aie2_destroy_context(xdna->dev_handle, hwctx);
+}
+
+static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
+{
+   struct amdxdna_gem_obj *heap = hwctx->priv->heap;
+   int ret;
+
+   ret = aie2_create_context(xdna->dev_handle, hwctx);
+   if (ret) {
+   XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
+   goto out;
+   }
+
+   ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+   heap->mem.userptr, heap->mem.size);
+   if (ret) {
+   XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
+   goto out;
+   }
+
+   if (hwctx->status != HWCTX_STAT_READY) {
+   XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
+   goto out;
+   }
+
+   ret = aie2_config_cu(hwctx);
+   if (ret) {
+   XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
+   goto out;
+   }
+
+out:
+   drm_sched_start(&hwctx->priv->sched);
+   XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
+   return ret;
+}
+
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
+{
+   struct amdxdna_dev *xdna = client->xdna;
+   struct amdxdna_hwctx *hwctx;
+   int next = 0;
+
+   drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+   mutex_lock(&client->hwctx_lock);
+   idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+   /* check if the HW context uses the error column */
+   if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
+   continue;
+
+   aie2_hwctx_stop(xdna, hwctx, NULL);
+   hwctx->old_status = hwctx->status;
+   hwctx->status = HWCTX_STAT_STOP;
+   XDNA_DBG(xdna, "Stop %s", hwctx->name);
+   }
+   mutex_unlock(&client->hwctx_lock);
+}
+
+void aie2_restart_ctx(struct amd

Re: [PATCH V6 07/10] accel/amdxdna: Add command execution

2024-11-03 Thread Matthew Brost
On Wed, Oct 30, 2024 at 08:51:44AM -0700, Lizhi Hou wrote:
> Add interfaces for user application to submit command and wait for its
> completion.
> 
> Co-developed-by: Min Ma 
> Signed-off-by: Min Ma 
> Signed-off-by: Lizhi Hou 
> ---
>  drivers/accel/amdxdna/aie2_ctx.c  | 664 +-
>  drivers/accel/amdxdna/aie2_message.c  | 343 +
>  drivers/accel/amdxdna/aie2_pci.c  |   5 +
>  drivers/accel/amdxdna/aie2_pci.h  |  35 +
>  drivers/accel/amdxdna/aie2_psp.c  |   2 +
>  drivers/accel/amdxdna/aie2_smu.c  |   2 +
>  drivers/accel/amdxdna/amdxdna_ctx.c   | 330 -
>  drivers/accel/amdxdna/amdxdna_ctx.h   | 111 +++
>  drivers/accel/amdxdna/amdxdna_gem.c   |  10 +
>  drivers/accel/amdxdna/amdxdna_gem.h   |   1 +
>  .../accel/amdxdna/amdxdna_mailbox_helper.c|   5 +
>  drivers/accel/amdxdna/amdxdna_pci_drv.c   |   5 +
>  drivers/accel/amdxdna/amdxdna_pci_drv.h   |   4 +
>  drivers/accel/amdxdna/amdxdna_sysfs.c |   5 +
>  drivers/accel/amdxdna/npu1_regs.c |   1 +
>  drivers/accel/amdxdna/npu2_regs.c |   1 +
>  drivers/accel/amdxdna/npu4_regs.c |   1 +
>  drivers/accel/amdxdna/npu5_regs.c |   1 +
>  include/trace/events/amdxdna.h|  41 ++
>  include/uapi/drm/amdxdna_accel.h  |  38 +
>  20 files changed, 1596 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
> index 617fc05077d9..c3ac668e16ab 100644
> --- a/drivers/accel/amdxdna/aie2_ctx.c
> +++ b/drivers/accel/amdxdna/aie2_ctx.c
> @@ -8,8 +8,12 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
> +#include 
>  
> +#include "aie2_msg_priv.h"
>  #include "aie2_pci.h"
>  #include "aie2_solver.h"
>  #include "amdxdna_ctx.h"
> @@ -17,6 +21,337 @@
>  #include "amdxdna_mailbox.h"
>  #include "amdxdna_pci_drv.h"
>  
> +bool force_cmdlist;
> +module_param(force_cmdlist, bool, 0600);
> +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
> +
> +#define HWCTX_MAX_TIMEOUT6 /* milliseconds */
> +
> +static struct amdxdna_sched_job *
> +aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
> +{
> + int idx;
> +
> + /* Special sequence number for oldest fence if exist */
> + if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
> + idx = get_job_idx(hwctx->priv->seq);
> + goto out;
> + }
> +
> + if (seq >= hwctx->priv->seq)
> + return ERR_PTR(-EINVAL);
> +
> + if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
> + return NULL;
> +
> + idx = get_job_idx(seq);
> +
> +out:
> + return hwctx->priv->pending[idx];
> +}
> +
> +/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
> +static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
> + struct drm_sched_job *bad_job)
> +{
> + drm_sched_stop(&hwctx->priv->sched, bad_job);
> + aie2_destroy_context(xdna->dev_handle, hwctx);
> +}
> +
> +static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
> +{
> + struct amdxdna_gem_obj *heap = hwctx->priv->heap;
> + int ret;
> +
> + ret = aie2_create_context(xdna->dev_handle, hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
> + goto out;
> + }
> +
> + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
> + heap->mem.userptr, heap->mem.size);
> + if (ret) {
> + XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
> + goto out;
> + }
> +
> + if (hwctx->status != HWCTX_STAT_READY) {
> + XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
> + goto out;
> + }
> +
> + ret = aie2_config_cu(hwctx);
> + if (ret) {
> + XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
> + goto out;
> + }
> +
> +out:
> + drm_sched_start(&hwctx->priv->sched);
> + XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
> + return ret;
> +}
> +
> +void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
> +{
> + struct amdxdna_dev *xdna = client->xdna;
> + struct amdxdna_hwctx *hwctx;
> + int next = 0;
> +
> + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
> + mutex_lock(&client->hwctx_lock);
> + idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
> + /* check if the HW context uses the error column */
> + if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
> + continue;
> +
> + aie2_hwctx_stop(xdna, hwctx, NULL);
> + hwctx->old_status = hwctx->status;
> + hwctx->status = HWCTX_STAT_STOP;
> + XDNA_DBG(xdna, "Stop %s", hwc

[PATCH V6 07/10] accel/amdxdna: Add command execution

2024-10-30 Thread Lizhi Hou
Add interfaces for user application to submit command and wait for its
completion.

Co-developed-by: Min Ma 
Signed-off-by: Min Ma 
Signed-off-by: Lizhi Hou 
---
 drivers/accel/amdxdna/aie2_ctx.c  | 664 +-
 drivers/accel/amdxdna/aie2_message.c  | 343 +
 drivers/accel/amdxdna/aie2_pci.c  |   5 +
 drivers/accel/amdxdna/aie2_pci.h  |  35 +
 drivers/accel/amdxdna/aie2_psp.c  |   2 +
 drivers/accel/amdxdna/aie2_smu.c  |   2 +
 drivers/accel/amdxdna/amdxdna_ctx.c   | 330 -
 drivers/accel/amdxdna/amdxdna_ctx.h   | 111 +++
 drivers/accel/amdxdna/amdxdna_gem.c   |  10 +
 drivers/accel/amdxdna/amdxdna_gem.h   |   1 +
 .../accel/amdxdna/amdxdna_mailbox_helper.c|   5 +
 drivers/accel/amdxdna/amdxdna_pci_drv.c   |   5 +
 drivers/accel/amdxdna/amdxdna_pci_drv.h   |   4 +
 drivers/accel/amdxdna/amdxdna_sysfs.c |   5 +
 drivers/accel/amdxdna/npu1_regs.c |   1 +
 drivers/accel/amdxdna/npu2_regs.c |   1 +
 drivers/accel/amdxdna/npu4_regs.c |   1 +
 drivers/accel/amdxdna/npu5_regs.c |   1 +
 include/trace/events/amdxdna.h|  41 ++
 include/uapi/drm/amdxdna_accel.h  |  38 +
 20 files changed, 1596 insertions(+), 9 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 617fc05077d9..c3ac668e16ab 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -8,8 +8,12 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
+#include 
 
+#include "aie2_msg_priv.h"
 #include "aie2_pci.h"
 #include "aie2_solver.h"
 #include "amdxdna_ctx.h"
@@ -17,6 +21,337 @@
 #include "amdxdna_mailbox.h"
 #include "amdxdna_pci_drv.h"
 
+bool force_cmdlist;
+module_param(force_cmdlist, bool, 0600);
+MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)");
+
+#define HWCTX_MAX_TIMEOUT  6 /* milliseconds */
+
+static struct amdxdna_sched_job *
+aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
+{
+   int idx;
+
+   /* Special sequence number for oldest fence if exist */
+   if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
+   idx = get_job_idx(hwctx->priv->seq);
+   goto out;
+   }
+
+   if (seq >= hwctx->priv->seq)
+   return ERR_PTR(-EINVAL);
+
+   if (seq + HWCTX_MAX_CMDS < hwctx->priv->seq)
+   return NULL;
+
+   idx = get_job_idx(seq);
+
+out:
+   return hwctx->priv->pending[idx];
+}
+
+/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */
+static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx,
+   struct drm_sched_job *bad_job)
+{
+   drm_sched_stop(&hwctx->priv->sched, bad_job);
+   aie2_destroy_context(xdna->dev_handle, hwctx);
+}
+
+static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx)
+{
+   struct amdxdna_gem_obj *heap = hwctx->priv->heap;
+   int ret;
+
+   ret = aie2_create_context(xdna->dev_handle, hwctx);
+   if (ret) {
+   XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret);
+   goto out;
+   }
+
+   ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id,
+   heap->mem.userptr, heap->mem.size);
+   if (ret) {
+   XDNA_ERR(xdna, "Map host buf failed, ret %d", ret);
+   goto out;
+   }
+
+   if (hwctx->status != HWCTX_STAT_READY) {
+   XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status);
+   goto out;
+   }
+
+   ret = aie2_config_cu(hwctx);
+   if (ret) {
+   XDNA_ERR(xdna, "Config cu failed, ret %d", ret);
+   goto out;
+   }
+
+out:
+   drm_sched_start(&hwctx->priv->sched);
+   XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret);
+   return ret;
+}
+
+void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map)
+{
+   struct amdxdna_dev *xdna = client->xdna;
+   struct amdxdna_hwctx *hwctx;
+   int next = 0;
+
+   drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+   mutex_lock(&client->hwctx_lock);
+   idr_for_each_entry_continue(&client->hwctx_idr, hwctx, next) {
+   /* check if the HW context uses the error column */
+   if (!(col_map & amdxdna_hwctx_col_map(hwctx)))
+   continue;
+
+   aie2_hwctx_stop(xdna, hwctx, NULL);
+   hwctx->old_status = hwctx->status;
+   hwctx->status = HWCTX_STAT_STOP;
+   XDNA_DBG(xdna, "Stop %s", hwctx->name);
+   }
+   mutex_unlock(&client->hwctx_lock);
+}
+
+void aie2_restart_ctx(struct amdxdna_client *client)
+{
+   struct amdxdna_dev *xdna = client->xdna;
+   struct amdxdna_hwctx *hwctx;
+   int next = 0;
+
+