We currently capture the GPU state after we detect a hang. This is vital
for us to both triage and debug hangs in the wild (post-mortem
debugging). However, it comes at the cost of running some potentially
dangerous code (since it has to make very few assumption about the state
of the driver) that is quite resource intensive.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig          | 10 ++++++++++
 drivers/gpu/drm/i915/i915_debugfs.c   |  6 ++++++
 drivers/gpu/drm/i915/i915_drv.h       | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c |  7 +++++++
 drivers/gpu/drm/i915/i915_params.c    |  9 +++++++++
 drivers/gpu/drm/i915/i915_params.h    |  1 +
 drivers/gpu/drm/i915/i915_sysfs.c     |  8 ++++++++
 drivers/gpu/drm/i915/intel_display.c  |  4 ++++
 drivers/gpu/drm/i915/intel_overlay.c  |  4 ++++
 9 files changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..10a6ac11b6a9 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -46,6 +46,16 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
 
          If in doubt, say "N".
 
+config DRM_I915_CAPTURE_ERROR
+       bool "Enable capturing GPU state following a hang"
+       depends on DRM_I915
+       default y
+       help
+         This option enables capturing the GPU state when a hang is detected.
+         This information is vital for triaging hangs and assists in debugging.
+
+         If in doubt, say "Y".
+
 config DRM_I915_USERPTR
        bool "Always enable userptr support"
        depends on DRM_I915
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 27b0e34dadec..278fb18efde2 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -956,6 +956,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
        return 0;
 }
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 static ssize_t
 i915_error_state_write(struct file *filp,
                       const char __user *ubuf,
@@ -1038,6 +1040,8 @@ static const struct file_operations i915_error_state_fops 
= {
        .release = i915_error_state_release,
 };
 
+#endif
+
 static int
 i915_next_seqno_get(void *data, u64 *val)
 {
@@ -5309,7 +5313,9 @@ static const struct i915_debugfs_files {
        {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
        {"i915_ring_test_irq", &i915_ring_test_irq_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
        {"i915_error_state", &i915_error_state_fops},
+#endif
        {"i915_next_seqno", &i915_next_seqno_fops},
        {"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
        {"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4dd307ed4336..1643acb02513 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3522,6 +3522,8 @@ static inline void intel_display_crc_init(struct 
drm_i915_private *dev_priv) {}
 #endif
 
 /* i915_gpu_error.c */
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
@@ -3542,6 +3544,20 @@ void i915_error_state_get(struct drm_device *dev,
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
 void i915_destroy_error_state(struct drm_device *dev);
 
+#else
+
+static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
+                                           u32 engine_mask,
+                                           const char *error_msg)
+{
+}
+
+static inline void i915_destroy_error_state(struct drm_device *dev)
+{
+}
+
+#endif
+
 void i915_get_extra_instdone(struct drm_i915_private *dev_priv, uint32_t 
*instdone);
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 334f15df7c8d..364f5b17df21 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -30,6 +30,8 @@
 #include <generated/utsrelease.h>
 #include "i915_drv.h"
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 static const char *engine_str(int engine)
 {
        switch (engine) {
@@ -1435,6 +1437,9 @@ void i915_capture_error_state(struct drm_i915_private 
*dev_priv,
        struct drm_i915_error_state *error;
        unsigned long flags;
 
+       if (!i915.error_capture)
+               return;
+
        if (READ_ONCE(dev_priv->gpu_error.first_error))
                return;
 
@@ -1520,6 +1525,8 @@ void i915_destroy_error_state(struct drm_device *dev)
                kref_put(&error->ref, i915_error_state_free);
 }
 
+#endif
+
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
 {
        switch (type) {
diff --git a/drivers/gpu/drm/i915/i915_params.c 
b/drivers/gpu/drm/i915/i915_params.c
index 768ad89d9cd4..e72a41223535 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
        .load_detect_test = 0,
        .force_reset_modeset_test = 0,
        .reset = true,
+       .error_capture = true,
        .invert_brightness = 0,
        .disable_display = 0,
        .enable_cmd_parser = 1,
@@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
 module_param_named_unsafe(reset, i915.reset, bool, 0600);
 MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+module_param_named(error_capture, i915.error_capture, bool, 0600);
+MODULE_PARM_DESC(error_capture,
+       "Record the GPU state following a hang. "
+       "This information in /sys/class/drm/card<N>/error is vital for "
+       "triaging and debugging hangs.");
+#endif
+
 module_param_named_unsafe(enable_hangcheck, i915.enable_hangcheck, bool, 0644);
 MODULE_PARM_DESC(enable_hangcheck,
        "Periodically check GPU activity for detecting hangs. "
diff --git a/drivers/gpu/drm/i915/i915_params.h 
b/drivers/gpu/drm/i915/i915_params.h
index 3a0dd78ddb38..94efc899c1ef 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -59,6 +59,7 @@ struct i915_params {
        bool load_detect_test;
        bool force_reset_modeset_test;
        bool reset;
+       bool error_capture;
        bool disable_display;
        bool verbose_state_checks;
        bool nuclear_pageflip;
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c 
b/drivers/gpu/drm/i915/i915_sysfs.c
index 1012eeea1324..c9b71f676a57 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -514,6 +514,8 @@ static const struct attribute *vlv_attrs[] = {
        NULL,
 };
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
                                struct bin_attribute *attr, char *buf,
                                loff_t off, size_t count)
@@ -571,6 +573,8 @@ static struct bin_attribute error_state_attr = {
        .write = error_state_write,
 };
 
+#endif
+
 void i915_setup_sysfs(struct drm_i915_private *dev_priv)
 {
        struct device *kdev = dev_priv->drm.primary->kdev;
@@ -617,17 +621,21 @@ void i915_setup_sysfs(struct drm_i915_private *dev_priv)
        if (ret)
                DRM_ERROR("RPS sysfs setup failed\n");
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
        ret = sysfs_create_bin_file(&kdev->kobj,
                                    &error_state_attr);
        if (ret)
                DRM_ERROR("error_state sysfs setup failed\n");
+#endif
 }
 
 void i915_teardown_sysfs(struct drm_i915_private *dev_priv)
 {
        struct device *kdev = dev_priv->drm.primary->kdev;
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
        sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+#endif
        if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
                sysfs_remove_files(&kdev->kobj, vlv_attrs);
        else
diff --git a/drivers/gpu/drm/i915/intel_display.c 
b/drivers/gpu/drm/i915/intel_display.c
index 497d99b88468..53ee34131ade 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -17083,6 +17083,8 @@ int intel_modeset_vga_set_state(struct drm_device *dev, 
bool state)
        return 0;
 }
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 struct intel_display_error_state {
 
        u32 power_well_driver;
@@ -17265,3 +17267,5 @@ intel_display_print_error_state(struct 
drm_i915_error_state_buf *m,
                err_printf(m, "  VSYNC: %08x\n", error->transcoder[i].vsync);
        }
 }
+
+#endif
diff --git a/drivers/gpu/drm/i915/intel_overlay.c 
b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c7889f..7c392547711f 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1470,6 +1470,8 @@ void intel_cleanup_overlay(struct drm_i915_private 
*dev_priv)
        kfree(dev_priv->overlay);
 }
 
+#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
+
 struct intel_overlay_error_state {
        struct overlay_registers regs;
        unsigned long base;
@@ -1587,3 +1589,5 @@ intel_overlay_print_error_state(struct 
drm_i915_error_state_buf *m,
        P(UVSCALEV);
 #undef P
 }
+
+#endif
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to