On Thu, Oct 24, 2019 at 6:40 AM Chris Wilson <ch...@chris-wilson.co.uk>
wrote:

> Our existing behaviour is to allow contexts and their GPU requests to
> persist past the point of closure until the requests are complete. This
> allows clients to operate in a 'fire-and-forget' manner where they can
> setup a rendering pipeline and hand it over to the display server and
> immediately exiting. As the rendering pipeline is kept alive until
> completion, the display server (or other consumer) can use the results
> in the future and present them to the user.
>
> However, not all clients want this persistent behaviour and would prefer
> that the contexts are cleaned up immediately upon closure. This ensures
> that when clients are run without hangchecking, any GPU hang is
> terminated with the process and does not continue to hog resources.
>
> By defining a context property to allow clients to control persistence
> explicitly, we can remove the blanket advice to disable hangchecking
> that seems to be far too prevalent.
>

Just to be clear, when you say "disable hangchecking" do you mean disabling
it for all processes via a kernel parameter at boot time or a sysfs entry
or similar?  Or is there some mechanism whereby a context can request no
hang checking?


> The default behaviour for new controls is the legacy persistence mode.
> New clients will have to opt out for immediate cleanup on context
> closure. If the hangchecking modparam is disabled, so is persistent
> context support -- all contexts will be terminated on closure.
>

What happens to fences when the context is cancelled?  Is it the same
behavior as we have today for when a GPU hang is detected and a context is
banned?

--Jason



> Testcase: igt/gem_ctx_persistence
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahti...@linux.intel.com>
> Cc: MichaƂ Winiarski <michal.winiar...@intel.com>
> Cc: Jon Bloomfield <jon.bloomfi...@intel.com>
> Reviewed-by: Jon Bloomfield <jon.bloomfi...@intel.com>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> ---
>  drivers/gpu/drm/i915/gem/i915_gem_context.c   | 50 ++++++++++++++++++-
>  drivers/gpu/drm/i915/gem/i915_gem_context.h   | 15 ++++++
>  .../gpu/drm/i915/gem/i915_gem_context_types.h |  1 +
>  .../gpu/drm/i915/gem/selftests/mock_context.c |  2 +
>  include/uapi/drm/i915_drm.h                   | 15 ++++++
>  5 files changed, 82 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 55f1f93c0925..0f1bbf5d1a11 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -437,12 +437,39 @@ static void context_close(struct i915_gem_context
> *ctx)
>          * case we opt to forcibly kill off all remaining requests on
>          * context close.
>          */
> -       if (!i915_modparams.enable_hangcheck)
> +       if (!i915_gem_context_is_persistent(ctx) ||
> +           !i915_modparams.enable_hangcheck)
>                 kill_context(ctx);
>
>         i915_gem_context_put(ctx);
>  }
>
> +static int __context_set_persistence(struct i915_gem_context *ctx, bool
> state)
> +{
> +       if (i915_gem_context_is_persistent(ctx) == state)
> +               return 0;
> +
> +       if (state) {
> +               /*
> +                * Only contexts that are short-lived [that will expire or
> be
> +                * reset] are allowed to survive past termination. We
> require
> +                * hangcheck to ensure that the persistent requests are
> healthy.
> +                */
> +               if (!i915_modparams.enable_hangcheck)
> +                       return -EINVAL;
> +
> +               i915_gem_context_set_persistence(ctx);
> +       } else {
> +               /* To cancel a context we use "preempt-to-idle" */
> +               if (!(ctx->i915->caps.scheduler &
> I915_SCHEDULER_CAP_PREEMPTION))
> +                       return -ENODEV;
> +
> +               i915_gem_context_clear_persistence(ctx);
> +       }
> +
> +       return 0;
> +}
> +
>  static struct i915_gem_context *
>  __create_context(struct drm_i915_private *i915)
>  {
> @@ -477,6 +504,7 @@ __create_context(struct drm_i915_private *i915)
>
>         i915_gem_context_set_bannable(ctx);
>         i915_gem_context_set_recoverable(ctx);
> +       __context_set_persistence(ctx, true /* cgroup hook? */);
>
>         for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
>                 ctx->hang_timestamp[i] = jiffies -
> CONTEXT_FAST_HANG_JIFFIES;
> @@ -633,6 +661,7 @@ i915_gem_context_create_kernel(struct drm_i915_private
> *i915, int prio)
>                 return ctx;
>
>         i915_gem_context_clear_bannable(ctx);
> +       i915_gem_context_set_persistence(ctx);
>         ctx->sched.priority = I915_USER_PRIORITY(prio);
>
>         GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
> @@ -1743,6 +1772,16 @@ get_engines(struct i915_gem_context *ctx,
>         return err;
>  }
>
> +static int
> +set_persistence(struct i915_gem_context *ctx,
> +               const struct drm_i915_gem_context_param *args)
> +{
> +       if (args->size)
> +               return -EINVAL;
> +
> +       return __context_set_persistence(ctx, args->value);
> +}
> +
>  static int ctx_setparam(struct drm_i915_file_private *fpriv,
>                         struct i915_gem_context *ctx,
>                         struct drm_i915_gem_context_param *args)
> @@ -1820,6 +1859,10 @@ static int ctx_setparam(struct
> drm_i915_file_private *fpriv,
>                 ret = set_engines(ctx, args);
>                 break;
>
> +       case I915_CONTEXT_PARAM_PERSISTENCE:
> +               ret = set_persistence(ctx, args);
> +               break;
> +
>         case I915_CONTEXT_PARAM_BAN_PERIOD:
>         default:
>                 ret = -EINVAL;
> @@ -2272,6 +2315,11 @@ int i915_gem_context_getparam_ioctl(struct
> drm_device *dev, void *data,
>                 ret = get_engines(ctx, args);
>                 break;
>
> +       case I915_CONTEXT_PARAM_PERSISTENCE:
> +               args->size = 0;
> +               args->value = i915_gem_context_is_persistent(ctx);
> +               break;
> +
>         case I915_CONTEXT_PARAM_BAN_PERIOD:
>         default:
>                 ret = -EINVAL;
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index cfe80590f0ed..18e50a769a6e 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -76,6 +76,21 @@ static inline void
> i915_gem_context_clear_recoverable(struct i915_gem_context *c
>         clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
>  }
>
> +static inline bool i915_gem_context_is_persistent(const struct
> i915_gem_context *ctx)
> +{
> +       return test_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_persistence(struct
> i915_gem_context *ctx)
> +{
> +       set_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_persistence(struct
> i915_gem_context *ctx)
> +{
> +       clear_bit(UCONTEXT_PERSISTENCE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct
> i915_gem_context *ctx)
>  {
>         return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index fe97b8ba4fda..861d7d92fe9f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -137,6 +137,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ERROR_CAPTURE      1
>  #define UCONTEXT_BANNABLE              2
>  #define UCONTEXT_RECOVERABLE           3
> +#define UCONTEXT_PERSISTENCE           4
>
>         /**
>          * @flags: small set of booleans
> diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> index 74ddd682c9cd..29b8984f0e47 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/mock_context.c
> @@ -22,6 +22,8 @@ mock_context(struct drm_i915_private *i915,
>         INIT_LIST_HEAD(&ctx->link);
>         ctx->i915 = i915;
>
> +       i915_gem_context_set_persistence(ctx);
> +
>         mutex_init(&ctx->engines_mutex);
>         e = default_engines(ctx);
>         if (IS_ERR(e))
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 63d40cba97e0..b5fd5e4bd16e 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1572,6 +1572,21 @@ struct drm_i915_gem_context_param {
>   *   i915_context_engines_bond (I915_CONTEXT_ENGINES_EXT_BOND)
>   */
>  #define I915_CONTEXT_PARAM_ENGINES     0xa
> +
> +/*
> + * I915_CONTEXT_PARAM_PERSISTENCE:
> + *
> + * Allow the context and active rendering to survive the process until
> + * completion. Persistence allows fire-and-forget clients to queue up a
> + * bunch of work, hand the output over to a display server and the quit.
> + * If the context is not marked as persistent, upon closing (either via
> + * an explicit DRM_I915_GEM_CONTEXT_DESTROY or implicitly from file
> closure
> + * or process termination), the context and any outstanding requests will
> be
> + * cancelled (and exported fences for cancelled requests marked as -EIO).
> + *
> + * By default, new contexts allow persistence.
> + */
> +#define I915_CONTEXT_PARAM_PERSISTENCE 0xb
>  /* Must be kept compact -- no holes and well documented */
>
>         __u64 value;
> --
> 2.24.0.rc0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to