[Intel-gfx] [PATCH 04/10] drm/i915/execlists: Force preemption

2019-10-14 Thread Chris Wilson
If the preempted context takes too long to relinquish control, e.g. it
is stuck inside a shader with arbitration disabled, evict that context
with an engine reset. This ensures that preemptions are reasonably
responsive, providing a tighter QoS for the more important context at
the cost of flagging unresponsive contexts more frequently (i.e. instead
of using an ~10s hangcheck, we now evict at ~100ms).  The challenge
lies in picking a timeout that can be reasonably serviced by HW for
typical workloads, balancing the existing clients against the needs for
responsiveness.

Note that coupled with timeslicing, this will lead to rapid GPU "hang"
detection with multiple active contexts vying for GPU time.

The preempt timeout can be adjusted per-engine using,

/sys/class/drm/card?/engine/*/preempt_timeout_ms

v2: Couple in sysfs control of preemption timeout

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Tvrtko Ursulin 
Reviewed-by: Mika Kuoppala 
---
 drivers/gpu/drm/i915/Kconfig.profile |  15 +++
 drivers/gpu/drm/i915/gt/intel_engine_cs.c|   2 +
 drivers/gpu/drm/i915/gt/intel_engine_sysfs.c |  33 +
 drivers/gpu/drm/i915/gt/intel_engine_types.h |   9 ++
 drivers/gpu/drm/i915/gt/intel_lrc.c  | 127 +--
 drivers/gpu/drm/i915/gt/selftest_lrc.c   |  98 ++
 drivers/gpu/drm/i915/i915_params.h   |   2 +-
 7 files changed, 277 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.profile 
b/drivers/gpu/drm/i915/Kconfig.profile
index 48df8889a88a..8fceea85937b 100644
--- a/drivers/gpu/drm/i915/Kconfig.profile
+++ b/drivers/gpu/drm/i915/Kconfig.profile
@@ -25,3 +25,18 @@ config DRM_I915_SPIN_REQUEST
  May be 0 to disable the initial spin. In practice, we estimate
  the cost of enabling the interrupt (if currently disabled) to be
  a few microseconds.
+
+config DRM_I915_PREEMPT_TIMEOUT
+   int "Preempt timeout (ms)"
+   default 100 # milliseconds
+   help
+ How long to wait (in milliseconds) for a preemption event to occur
+ when submitting a new context via execlists. If the current context
+ does not hit an arbitration point and yield to HW before the timer
+ expires, the HW will be reset to allow the more important context
+ to execute.
+
+ This is adjustable via
+ /sys/class/drm/card?/engine/*/preempt_timeout_ms
+
+ May be 0 to disable the timeout.
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index c9d639c6becb..1eb51147839a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -304,6 +304,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum 
intel_engine_id id)
engine->instance = info->instance;
__sprint_engine_name(engine);
 
+   engine->props.preempt_timeout = CONFIG_DRM_I915_PREEMPT_TIMEOUT;
+
/*
 * To be overridden by the backend on setup. However to facilitate
 * cleanup on error during setup, we always provide the destroy vfunc.
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
index 823153e56c67..1aae83eb4237 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
@@ -133,6 +133,34 @@ all_caps_show(struct kobject *kobj, struct kobj_attribute 
*attr, char *buf)
 static struct kobj_attribute all_caps_attr =
 __ATTR(known_capabilities, 0444, all_caps_show, NULL);
 
+static ssize_t
+preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
+char *buf)
+{
+   struct intel_engine_cs *engine = kobj_to_engine(kobj);
+
+   return sprintf(buf, "%lu\n", engine->props.preempt_timeout);
+}
+
+static ssize_t
+preempt_timeout_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct intel_engine_cs *engine = kobj_to_engine(kobj);
+   unsigned long timeout;
+   int err;
+
+   err = kstrtoul(buf, 0, &timeout);
+   if (err)
+   return err;
+
+   WRITE_ONCE(engine->props.preempt_timeout, timeout);
+   return count;
+}
+
+static struct kobj_attribute preempt_timeout_attr =
+__ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);
+
 static void kobj_engine_release(struct kobject *kobj)
 {
kfree(kobj);
@@ -193,6 +221,11 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
if (sysfs_create_files(kobj, files))
goto err_object;
 
+   if (CONFIG_DRM_I915_PREEMPT_TIMEOUT &&
+   intel_engine_has_preemption(engine) &&
+   sysfs_create_file(kobj, &preempt_timeout_attr.attr))
+   goto err_engine;
+
if (0) {
 err_object:
kobject_put(kobj);
diff --git 

[Intel-gfx] [PATCH 04/10] drm/i915/execlists: Force preemption

2019-10-10 Thread Chris Wilson
If the preempted context takes too long to relinquish control, e.g. it
is stuck inside a shader with arbitration disabled, evict that context
with an engine reset. This ensures that preemptions are reasonably
responsive, providing a tighter QoS for the more important context at
the cost of flagging unresponsive contexts more frequently (i.e. instead
of using an ~10s hangcheck, we now evict at ~100ms).  The challenge
lies in picking a timeout that can be reasonably serviced by HW for
typical workloads, balancing the existing clients against the needs for
responsiveness.

Note that coupled with timeslicing, this will lead to rapid GPU "hang"
detection with multiple active contexts vying for GPU time.

The preempt timeout can be adjusted per-engine using,

/sys/class/drm/card?/engine/*/preempt_timeout_ms

v2: Couple in sysfs control of preemption timeout

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Tvrtko Ursulin 
Reviewed-by: Mika Kuoppala 
---
 drivers/gpu/drm/i915/Kconfig.profile | 15 
 drivers/gpu/drm/i915/gt/intel_engine_cs.c|  2 +
 drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 32 +++
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  9 ++
 drivers/gpu/drm/i915/gt/intel_lrc.c  | 95 ++--
 drivers/gpu/drm/i915/i915_params.h   |  2 +-
 6 files changed, 146 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.profile 
b/drivers/gpu/drm/i915/Kconfig.profile
index 48df8889a88a..8fceea85937b 100644
--- a/drivers/gpu/drm/i915/Kconfig.profile
+++ b/drivers/gpu/drm/i915/Kconfig.profile
@@ -25,3 +25,18 @@ config DRM_I915_SPIN_REQUEST
  May be 0 to disable the initial spin. In practice, we estimate
  the cost of enabling the interrupt (if currently disabled) to be
  a few microseconds.
+
+config DRM_I915_PREEMPT_TIMEOUT
+   int "Preempt timeout (ms)"
+   default 100 # milliseconds
+   help
+ How long to wait (in milliseconds) for a preemption event to occur
+ when submitting a new context via execlists. If the current context
+ does not hit an arbitration point and yield to HW before the timer
+ expires, the HW will be reset to allow the more important context
+ to execute.
+
+ This is adjustable via
+ /sys/class/drm/card?/engine/*/preempt_timeout_ms
+
+ May be 0 to disable the timeout.
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index c9d639c6becb..1eb51147839a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -304,6 +304,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum 
intel_engine_id id)
engine->instance = info->instance;
__sprint_engine_name(engine);
 
+   engine->props.preempt_timeout = CONFIG_DRM_I915_PREEMPT_TIMEOUT;
+
/*
 * To be overridden by the backend on setup. However to facilitate
 * cleanup on error during setup, we always provide the destroy vfunc.
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
index cbe9ec59beeb..aac26097c916 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
@@ -45,10 +45,37 @@ mmio_show(struct kobject *kobj, struct kobj_attribute 
*attr, char *buf)
return sprintf(buf, "0x%x\n", kobj_to_engine(kobj)->mmio_base);
 }
 
+static ssize_t
+preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
+char *buf)
+{
+   struct intel_engine_cs *engine = kobj_to_engine(kobj);
+
+   return sprintf(buf, "%lu\n", engine->props.preempt_timeout);
+}
+
+static ssize_t
+preempt_timeout_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct intel_engine_cs *engine = kobj_to_engine(kobj);
+   unsigned long timeout;
+   int err;
+
+   err = kstrtoul(buf, 0, &timeout);
+   if (err)
+   return err;
+
+   engine->props.preempt_timeout = timeout;
+   return count;
+}
+
 static struct kobj_attribute name_attr = __ATTR(name, 0444, name_show, NULL);
 static struct kobj_attribute class_attr = __ATTR(class, 0444, class_show, 
NULL);
 static struct kobj_attribute inst_attr = __ATTR(instance, 0444, inst_show, 
NULL);
 static struct kobj_attribute mmio_attr = __ATTR(mmio_base, 0444, mmio_show, 
NULL);
+static struct kobj_attribute preempt_timeout_attr =
+__ATTR(preempt_timeout_ms, 0600, preempt_timeout_show, preempt_timeout_store);
 
 static void kobj_engine_release(struct kobject *kobj)
 {
@@ -109,6 +136,11 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
if (sysfs_create_files(kobj, files))
goto err_engine;
 
+   if (CONFIG_DRM_I915_PREEMPT_TIMEOUT &&
+   intel_engine_has_preemption(engine) &&
+