From: Karol Wachowski <karol.wachow...@intel.com>

Add a new inference_timeout_ms module parameter that specifies the
maximum duration, in milliseconds, that an inference may take before
recovery is triggered. A value of 0 selects the built-in default.

Calculate the maximum number of heartbeat retries as the ratio of the
inference timeout to the TDR timeout, rounded up.

Signed-off-by: Karol Wachowski <karol.wachow...@intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynow...@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_drv.h |  1 +
 drivers/accel/ivpu/ivpu_hw.c  |  4 ++++
 drivers/accel/ivpu/ivpu_pm.c  | 15 ++++++++++++---
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
index 5497e7030e915..b6d6b3238b596 100644
--- a/drivers/accel/ivpu/ivpu_drv.h
+++ b/drivers/accel/ivpu/ivpu_drv.h
@@ -165,6 +165,7 @@ struct ivpu_device {
                int boot;
                int jsm;
                int tdr;
+               int inference;
                int autosuspend;
                int d0i3_entry_msg;
                int state_dump_msg;
diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c
index 633160470c939..08dcc31b56f4d 100644
--- a/drivers/accel/ivpu/ivpu_hw.c
+++ b/drivers/accel/ivpu/ivpu_hw.c
@@ -94,12 +94,14 @@ static void timeouts_init(struct ivpu_device *vdev)
                vdev->timeout.boot = -1;
                vdev->timeout.jsm = -1;
                vdev->timeout.tdr = -1;
+               vdev->timeout.inference = -1;
                vdev->timeout.autosuspend = -1;
                vdev->timeout.d0i3_entry_msg = -1;
        } else if (ivpu_is_fpga(vdev)) {
                vdev->timeout.boot = 50;
                vdev->timeout.jsm = 15000;
                vdev->timeout.tdr = 30000;
+               vdev->timeout.inference = 900000;
                vdev->timeout.autosuspend = -1;
                vdev->timeout.d0i3_entry_msg = 500;
                vdev->timeout.state_dump_msg = 10000;
@@ -107,6 +109,7 @@ static void timeouts_init(struct ivpu_device *vdev)
                vdev->timeout.boot = 50;
                vdev->timeout.jsm = 500;
                vdev->timeout.tdr = 10000;
+               vdev->timeout.inference = 300000;
                vdev->timeout.autosuspend = 100;
                vdev->timeout.d0i3_entry_msg = 100;
                vdev->timeout.state_dump_msg = 10;
@@ -114,6 +117,7 @@ static void timeouts_init(struct ivpu_device *vdev)
                vdev->timeout.boot = 1000;
                vdev->timeout.jsm = 500;
                vdev->timeout.tdr = 2000;
+               vdev->timeout.inference = 60000;
                if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX)
                        vdev->timeout.autosuspend = 10;
                else
diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
index ea30db181cd75..eacda1dbe8405 100644
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -33,8 +33,11 @@ static unsigned long ivpu_tdr_timeout_ms;
 module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
 MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in 
milliseconds, 0 - default");
 
+static unsigned long ivpu_inference_timeout_ms;
+module_param_named(inference_timeout_ms, ivpu_inference_timeout_ms, ulong, 
0644);
+MODULE_PARM_DESC(inference_timeout_ms, "Inference maximum duration, in 
milliseconds, 0 - default");
+
 #define PM_RESCHEDULE_LIMIT     5
-#define PM_TDR_HEARTBEAT_LIMIT  30
 
 static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
 {
@@ -191,6 +194,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 {
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, 
job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;
+       unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : 
vdev->timeout.tdr;
+       unsigned long inference_timeout_ms = ivpu_inference_timeout_ms ? 
ivpu_inference_timeout_ms :
+                                            vdev->timeout.inference;
+       u64 inference_max_retries;
        u64 heartbeat;
 
        if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= 
vdev->fw->last_heartbeat) {
@@ -198,8 +205,10 @@ static void ivpu_job_timeout_work(struct work_struct *work)
                goto recovery;
        }
 
-       if (atomic_fetch_inc(&vdev->job_timeout_counter) > 
PM_TDR_HEARTBEAT_LIMIT) {
-               ivpu_err(vdev, "Job timeout detected, heartbeat limit 
exceeded\n");
+       inference_max_retries = DIV_ROUND_UP(inference_timeout_ms, timeout_ms);
+       if (atomic_fetch_inc(&vdev->job_timeout_counter) >= 
inference_max_retries) {
+               ivpu_err(vdev, "Job timeout detected, heartbeat limit (%lld) 
exceeded\n",
+                        inference_max_retries);
                goto recovery;
        }
 
-- 
2.45.1

Reply via email to