Hi Krzysztof,

On Fri, Jan 30, 2026 at 07:45:08PM +0100, Krzysztof Niemiec wrote:
> The i915_active selftests live_active_wait and live_active_retire
> operate on an i915_active attached to a mock, empty request, created as
> part of test setup. A fence is attached to this request to control when
> the request is processed. The tests then wait for the completion of the
> active with __i915_active_wait(), and the test is considered successful
> if this results in setting a variable in the active callback.
> 
> However, the behavior of __i915_active_wait() is such that if the
> refcount for the active is 0, the function is almost completely skipped;
> waiting on a already completed active yields no effect. This includes a
> subsequent call to the retire() function of the active (which is the
> callback that the test is interested about, and which dictates whether
> its successful or not). So, if the active is completed before the
> aforementioned call to __i915_active_wait(), the test will fail.
> 
> Most of the test runs in a single thread, including creating the
> request, creating the fence for it, signalling that fence, and calling
> __i915_active_wait(). However, the request itself is handled
> asynchronously. This creates a race condition where if the request is
> completed after signalling the fence, but before waiting on its active,
> the active callback will not be invoked, failing the test.
> 
> Defer signalling the request's fence, to ensure the main test thread
> gets to call __i915_active_wait() before request completion.
> 
> v4:
> - Lower the delay timeout to 50ms (Jonathan)
> - Put the check on work_finished inside a helper function (Jonathan)
> 
> v3:
> - Embed the variables inside the live_active struct (Andi)
> - Move the schedule_delayed_work call closer to the wait (Andi)
> - Implement error handling in case an error state - the wait has
>   finished, but the deferred work didn't run - is somehow achieved (Andi)
> 
> v2:
> - Clarify the need for a fix a little more (Krzysztof K., Janusz)
> 
> Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/14808
> Signed-off-by: Krzysztof Niemiec <[email protected]>

I'm sorry, but for now this patch, for the reason I explained you
in a previous version, is a nack.

You are trying to bypass locking issues by adding a random 50ms
delay. It's too fragile and looks hackish.

In any case, I've seen a few issues below, in case someone else
agrees and is willing to merge it.

> ---
>  drivers/gpu/drm/i915/selftests/i915_active.c | 51 +++++++++++++++++---
>  1 file changed, 45 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c 
> b/drivers/gpu/drm/i915/selftests/i915_active.c
> index 36c3a5460221..eadd0a8bc094 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_active.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_active.c
> @@ -21,6 +21,10 @@ struct live_active {
>       struct i915_active base;
>       struct kref ref;
>       bool retired;
> +
> +     struct i915_sw_fence *submit;
> +     struct delayed_work work;
> +     bool work_finished;
>  };
>  
>  static void __live_get(struct live_active *active)
> @@ -76,11 +80,37 @@ static struct live_active *__live_alloc(struct 
> drm_i915_private *i915)
>       return active;
>  }
>  
> +static void __live_submit_work_handler(struct work_struct *work)

I don't see the point for the '__' here.

> +{
> +     struct delayed_work *d_work = container_of(work, struct delayed_work, 
> work);
> +     struct live_active *active = container_of(d_work, struct live_active, 
> work);
> +     i915_sw_fence_commit(active->submit);
> +     heap_fence_put(active->submit);
> +     active->work_finished = true;
> +}
> +
> +static int
> +__live_work_confirm_finished(struct drm_i915_private *i915,
> +                          struct live_active *active)
> +{
> +     int err = 0;
> +
> +     if (!active->work_finished) {
> +             struct drm_printer p = drm_err_printer(&i915->drm, __func__);
> +
> +             drm_printf(&p, "active->work hasn't finished, something went\
> +                             terribly wrong\n");

until 100 characters per line is fine, but I'm sure you can
reword to something better.

> +             err = -EINVAL;
> +             cancel_delayed_work_sync(&active->work);
> +     }
> +
> +     return err;
> +}
> +
>  static struct live_active *
>  __live_active_setup(struct drm_i915_private *i915)
>  {
>       struct intel_engine_cs *engine;
> -     struct i915_sw_fence *submit;
>       struct live_active *active;
>       unsigned int count = 0;
>       int err = 0;
> @@ -89,8 +119,11 @@ __live_active_setup(struct drm_i915_private *i915)
>       if (!active)
>               return ERR_PTR(-ENOMEM);
>  
> -     submit = heap_fence_create(GFP_KERNEL);
> -     if (!submit) {
> +     INIT_DELAYED_WORK(&active->work, __live_submit_work_handler);
> +     active->work_finished = false;
> +
> +     active->submit = heap_fence_create(GFP_KERNEL);
> +     if (!active->submit) {
>               kfree(active);
>               return ERR_PTR(-ENOMEM);
>       }
> @@ -109,7 +142,7 @@ __live_active_setup(struct drm_i915_private *i915)
>               }
>  
>               err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> -                                                    submit,
> +                                                    active->submit,
>                                                      GFP_KERNEL);
>               if (err >= 0)
>                       err = i915_active_add_request(&active->base, rq);
> @@ -134,8 +167,6 @@ __live_active_setup(struct drm_i915_private *i915)
>       }
>  
>  out:
> -     i915_sw_fence_commit(submit);
> -     heap_fence_put(submit);
>       if (err) {
>               __live_put(active);
>               active = ERR_PTR(err);
> @@ -156,6 +187,8 @@ static int live_active_wait(void *arg)
>       if (IS_ERR(active))
>               return PTR_ERR(active);

if we return with an error...

>  
> +     schedule_delayed_work(&active->work, msecs_to_jiffies(50));

... we don't schedule the work and therefore you leak the fence.

> +
>       __i915_active_wait(&active->base, TASK_UNINTERRUPTIBLE);
>       if (!READ_ONCE(active->retired)) {
>               struct drm_printer p = drm_err_printer(&i915->drm, __func__);
> @@ -166,6 +199,8 @@ static int live_active_wait(void *arg)
>               err = -EINVAL;
>       }
>  
> +     err = __live_work_confirm_finished(i915, active);

Here you are overwriting err.

> +
>       __live_put(active);
>  
>       if (igt_flush_test(i915))
> @@ -186,6 +221,8 @@ static int live_active_retire(void *arg)
>       if (IS_ERR(active))
>               return PTR_ERR(active);
>  
> +     schedule_delayed_work(&active->work, msecs_to_jiffies(50));
> +
>       /* waits for & retires all requests */
>       if (igt_flush_test(i915))
>               err = -EIO;
> @@ -199,6 +236,8 @@ static int live_active_retire(void *arg)
>               err = -EINVAL;
>       }
>  
> +     err = __live_work_confirm_finished(i915, active);

here as well.

Andi

> +
>       __live_put(active);
>  
>       return err;
> -- 
> 2.45.2
> 

Reply via email to