Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-20 Thread Michel Thierry



On 20/04/17 01:52, Chris Wilson wrote:

On Wed, Apr 19, 2017 at 06:09:00PM -0700, Michel Thierry wrote:

This patch is missing:

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
b/drivers/gpu/drm/i915/i915_gem_context.c
index c1013af0b910..a8bdea43a217 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1135,7 +1135,7 @@ int i915_gem_context_getparam_ioctl(struct
drm_device *dev, void *data,
return PTR_ERR(ctx);
}

-   args->size = 0;
+   args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 :
args->size;
switch (args->param) {
case I915_CONTEXT_PARAM_BAN_PERIOD:
ret = -EINVAL;

Or there will be no way to get the current thresholds (chunk was
missed due to some TRTT code nearby). I'll be sure to include it in
the next version.


No. It is always preset to 0. The PARAM should set it to the actual
struct size (it would write) and *not* the user's size.
-Chris



Ok, then I'll change the shortcut in get_watchdog, because as it is you 
can query the size, but not the thresholds.


int i915_gem_context_get_watchdog()
{
...
if (args->size == 0)
goto out;
...
out:
args->size = sizeof(threshold_in_us);

return 0;
}
}
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-20 Thread Chris Wilson
On Wed, Apr 19, 2017 at 06:09:00PM -0700, Michel Thierry wrote:
> This patch is missing:
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index c1013af0b910..a8bdea43a217 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1135,7 +1135,7 @@ int i915_gem_context_getparam_ioctl(struct
> drm_device *dev, void *data,
>   return PTR_ERR(ctx);
>   }
> 
> - args->size = 0;
> + args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 :
> args->size;
>   switch (args->param) {
>   case I915_CONTEXT_PARAM_BAN_PERIOD:
>   ret = -EINVAL;
> 
> Or there will be no way to get the current thresholds (chunk was
> missed due to some TRTT code nearby). I'll be sure to include it in
> the next version.

No. It is always preset to 0. The PARAM should set it to the actual
struct size (it would write) and *not* the user's size.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-19 Thread Michel Thierry

On 18/04/17 13:23, Michel Thierry wrote:

Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 6 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

Note, UABI engine ids and i915 engine ids are different, and this patch
uses the i915 ones. Some kind of mapping table [1] is required if we
decide to use the UABI engine ids.

[1] 
http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

v3: Add a way to get array size, short-cut to disable all thresholds,
return EFAULT / EINVAL as needed. Move the capture of the threshold
value in the error state into a new patch. BXT has a different
timestamp base (because why not?).

Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  29 +
 drivers/gpu/drm/i915/i915_gem_context.c | 102 
 drivers/gpu/drm/i915/intel_lrc.c|   5 +-
 include/uapi/drm/i915_drm.h |   1 +
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 203f2112dd18..f65a236fddef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
return >timeline.engine[engine->id];
 }

+/*
+ * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
+ * or 1250 counts per second, or ~12 counts per microsecond.
+ *
+ * But Broxton Timestamp timer resolution is different, 0.052 uSec,
+ * or 1920 counts per second, or ~19 counts per microsecond.
+ */
+#define SKL_TIMESTAMP_CNTS_PER_USEC 12
+#define BXT_TIMESTAMP_CNTS_PER_USEC 19
+#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
+  BXT_TIMESTAMP_CNTS_PER_USEC : \
+  SKL_TIMESTAMP_CNTS_PER_USEC)
+static inline u32
+watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
+{
+   return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
+}
+
+static inline u32
+watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
+{
+   u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
+
+   if (overflows_type(threshold, u32))
+   return -EINVAL;
+
+   return threshold;
+}
+
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index edbed85a1c88..85a6467a25a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev)
return ctx;
 }

+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+ struct drm_i915_gem_context_param *args)
+{
+   struct drm_i915_private *dev_priv = ctx->i915;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u32 threshold_in_us[I915_NUM_ENGINES];
+
+   if (args->size == 0)
+   goto out;
+
+   if (args->size < sizeof(threshold_in_us))
+   return -EFAULT;
+
+   if (!dev_priv->engine[VCS]->emit_start_watchdog)
+   return -ENODEV;
+
+   for_each_engine(engine, dev_priv, id) {
+   struct intel_context *ce = >engine[id];
+
+   threshold_in_us[id] = watchdog_to_us(dev_priv,
+ce->watchdog_threshold);
+   }
+
+   mutex_unlock(_priv->drm.struct_mutex);
+   if (__copy_to_user(u64_to_user_ptr(args->value),
+  _in_us,
+  sizeof(threshold_in_us))) {
+   mutex_lock(_priv->drm.struct_mutex);
+   return -EFAULT;
+   }
+   mutex_lock(_priv->drm.struct_mutex);
+
+out:
+   args->size = sizeof(threshold_in_us);
+
+   return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by setting it to 0.
+ */
+int 

Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-19 Thread Daniele Ceraolo Spurio



On 18/04/17 13:23, Michel Thierry wrote:

Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 6 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

Note, UABI engine ids and i915 engine ids are different, and this patch
uses the i915 ones. Some kind of mapping table [1] is required if we
decide to use the UABI engine ids.

[1] 
http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk


Now that we have class and instance definitions, could it be worth to 
use those instead to index the engines? If we pair it with the engine 
discovery ioctl that Tvrtko proposed we could have something that is 
reasonably future-proof.


Thanks,
Daniele
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-19 Thread Jeff McGee
On Tue, Apr 18, 2017 at 01:23:33PM -0700, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
> 
> The recommended default watchdog threshold for video engines is 6 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
> 
> Note, UABI engine ids and i915 engine ids are different, and this patch
> uses the i915 ones. Some kind of mapping table [1] is required if we
> decide to use the UABI engine ids.
> 
> [1] 
> http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk
> 
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
> 
> v3: Add a way to get array size, short-cut to disable all thresholds,
> return EFAULT / EINVAL as needed. Move the capture of the threshold
> value in the error state into a new patch. BXT has a different
> timestamp base (because why not?).
> 
> Signed-off-by: Tomas Elf 
> Signed-off-by: Arun Siluvery 
> Signed-off-by: Michel Thierry 
> ---
>  drivers/gpu/drm/i915/i915_drv.h |  29 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 102 
> 
>  drivers/gpu/drm/i915/intel_lrc.c|   5 +-
>  include/uapi/drm/i915_drm.h |   1 +
>  4 files changed, 135 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 203f2112dd18..f65a236fddef 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct 
> i915_gem_context *ctx,
>   return >timeline.engine[engine->id];
>  }
>  
> +/*
> + * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
> + * or 1250 counts per second, or ~12 counts per microsecond.
> + *
> + * But Broxton Timestamp timer resolution is different, 0.052 uSec,
> + * or 1920 counts per second, or ~19 counts per microsecond.
> + */
> +#define SKL_TIMESTAMP_CNTS_PER_USEC 12
> +#define BXT_TIMESTAMP_CNTS_PER_USEC 19
> +#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
> +BXT_TIMESTAMP_CNTS_PER_USEC : \
> +SKL_TIMESTAMP_CNTS_PER_USEC)
> +static inline u32
> +watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
> +{
> + return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
> +}
> +
> +static inline u32
> +watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
> +{
> + u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
> +
> + if (overflows_type(threshold, u32))
> + return -EINVAL;
> +
> + return threshold;
> +}
> +
>  int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>struct drm_file *file);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..85a6467a25a6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>   return ctx;
>  }
>  
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +   struct drm_i915_gem_context_param *args)
> +{
> + struct drm_i915_private *dev_priv = ctx->i915;
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> + u32 threshold_in_us[I915_NUM_ENGINES];
> +
> + if (args->size == 0)
> + goto out;
> +
> + if (args->size < sizeof(threshold_in_us))
> + return -EFAULT;
> +
> + if (!dev_priv->engine[VCS]->emit_start_watchdog)
> + return -ENODEV;
> +
> + for_each_engine(engine, dev_priv, id) {
> + struct intel_context *ce = >engine[id];
> +
> + threshold_in_us[id] = watchdog_to_us(dev_priv,
> +  ce->watchdog_threshold);
> + }
> +
> + mutex_unlock(_priv->drm.struct_mutex);
> + if (__copy_to_user(u64_to_user_ptr(args->value),
> +_in_us,
> +sizeof(threshold_in_us))) {
> + mutex_lock(_priv->drm.struct_mutex);
> + return -EFAULT;
> + }
> + mutex_lock(_priv->drm.struct_mutex);
> +
> +out:
> + args->size = sizeof(threshold_in_us);
> +
> + return 0;
> +}
> +
> +/*