Currently, requests are considered expired after a certain
amount of time runs out (DRM_I915_REQUEST_TIMEOUT). The logic
outright cancels such requests, without regard to them being
processed or preempted. This behavior may be observed with long
running workloads, which are being processed, albeit very
slowly.

Amend the problem by adding a set of time related fields to
i915 request structure and using them to calculate actual
work time the request consumed on the GPU.

Signed-off-by: Krzysztof Karas <[email protected]>
---
 drivers/gpu/drm/i915/gt/intel_gt_requests.c |  7 +++++++
 drivers/gpu/drm/i915/i915_request.c         | 10 ++++++++++
 drivers/gpu/drm/i915/i915_request.h         |  3 +++
 3 files changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c 
b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
index 93298820bee2..24f8d02e6abb 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
@@ -249,10 +249,17 @@ void intel_gt_watchdog_work(struct work_struct *work)
 
        llist_for_each_entry_safe(rq, rn, first, watchdog.link) {
                if (!i915_request_completed(rq)) {
+                       ktime_t timeout = rq->context->watchdog.timeout_us * 
NSEC_PER_USEC;
+                       ktime_t total = rq->watchdog.total_run_time;
                        struct dma_fence *f = &rq->fence;
                        const char __rcu *timeline;
                        const char __rcu *driver;
 
+                       if (i915_request_is_running(rq) ||
+                           (i915_request_started(rq) && total < timeout)) {
+                               continue;
+                       }
+
                        rcu_read_lock();
                        driver = dma_fence_driver_name(f);
                        timeline = dma_fence_timeline_name(f);
diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index d2c7b1090df0..697976f3c7fa 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -676,6 +676,7 @@ bool __i915_request_submit(struct i915_request *request)
 active:
        clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags);
        set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
+       request->watchdog.running_since = ktime_get();
 
        /*
         * XXX Rollback bonded-execution on __i915_request_unsubmit()?
@@ -731,6 +732,15 @@ void __i915_request_unsubmit(struct i915_request *request)
         */
        GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
        clear_bit_unlock(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
+       if (ktime_to_ns(request->watchdog.running_since)) {
+               ktime_t now = ktime_get();
+
+               request->watchdog.total_run_time =
+                       ktime_add(request->watchdog.total_run_time,
+                                 ktime_sub(now, 
request->watchdog.running_since));
+               request->watchdog.preempted_at = now;
+               request->watchdog.running_since = 0;
+       }
        if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
                i915_request_cancel_breadcrumb(request);
 
diff --git a/drivers/gpu/drm/i915/i915_request.h 
b/drivers/gpu/drm/i915/i915_request.h
index b09135301f39..48b619ca6bf4 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -325,6 +325,9 @@ struct i915_request {
        struct i915_request_watchdog {
                struct llist_node link;
                struct hrtimer timer;
+               ktime_t running_since;
+               ktime_t preempted_at;
+               ktime_t total_run_time;
        } watchdog;
 
        /*
-- 
2.34.1

Reply via email to