Report the per-ring timeout in milliseconds and allow users to adjust
the timeout dynamically. This can be useful for debugging, e.g. to more
easily test whether a submission genuinely hangs or is just taking very
long, and to temporarily disable GPU recovery so that shader problems
can be examined in detail, including single-stepping through shader
code.

It feels a bit questionable to access ring->sched.timeout without any
locking -- under a C++ memory model it would technically be undefined
behavior. But it's not like a lot can go wrong here in practice, and
it's not clear to me what locking or atomics, if any, should be used.

Signed-off-by: Nicolai Hähnle <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 32 +++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index dc474b809604..32d223daa789 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -471,35 +471,65 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, 
char __user *buf,
 
        return result;
 }
 
 static const struct file_operations amdgpu_debugfs_ring_fops = {
        .owner = THIS_MODULE,
        .read = amdgpu_debugfs_ring_read,
        .llseek = default_llseek
 };
 
+static int amdgpu_debugfs_timeout_ring_get(void *data, u64 *val) {
+       struct amdgpu_ring *ring = data;
+
+       if (ring->sched.timeout == MAX_SCHEDULE_TIMEOUT)
+               *val = 0;
+       else
+               *val = jiffies_to_msecs(ring->sched.timeout);
+
+       return 0;
+}
+
+static int amdgpu_debugfs_timeout_ring_set(void *data, u64 val) {
+       struct amdgpu_ring *ring = data;
+
+       if (val == 0)
+               ring->sched.timeout = MAX_SCHEDULE_TIMEOUT;
+       else
+               ring->sched.timeout = msecs_to_jiffies(val);
+
+       return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_timeout_ring_fops,
+                        amdgpu_debugfs_timeout_ring_get,
+                        amdgpu_debugfs_timeout_ring_set,
+                        "%llu\n");
+
 #endif
 
 void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
                              struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
        struct drm_minor *minor = adev_to_drm(adev)->primary;
        struct dentry *root = minor->debugfs_root;
-       char name[32];
+       char name[40];
 
        sprintf(name, "amdgpu_ring_%s", ring->name);
        debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
                                 &amdgpu_debugfs_ring_fops,
                                 ring->ring_size + 12);
 
+       sprintf(name, "amdgpu_timeout_ring_%s", ring->name);
+       debugfs_create_file(name, S_IFREG | S_IRUGO | S_IWUSR, root, ring,
+                           &amdgpu_debugfs_timeout_ring_fops);
 #endif
 }
 
 /**
  * amdgpu_ring_test_helper - tests ring and set sched readiness status
  *
  * @ring: ring to try the recovery on
  *
  * Tests ring and set sched readiness status
  *
-- 
2.40.0

Reply via email to