[PATCH 6/9] drm/radeon: handle lockup in delayed work, v5

2014-08-27 Thread Christian König
From: Christian König 

v5 (chk): complete rework, start when the first fence is emitted,
  stop when the last fence is signalled, make it work
  correctly with GPU resets, cleanup radeon_fence_wait_seq

Signed-off-by: Maarten Lankhorst 
Signed-off-by: Christian König 
---
 drivers/gpu/drm/radeon/radeon.h   |   2 +
 drivers/gpu/drm/radeon/radeon_fence.c | 200 +-
 2 files changed, 124 insertions(+), 78 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index f528ae8..fce8b32 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -349,6 +349,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, 
unsigned *bankw,
  * Fences.
  */
 struct radeon_fence_driver {
+   struct radeon_device*rdev;
uint32_tscratch_reg;
uint64_tgpu_addr;
volatile uint32_t   *cpu_addr;
@@ -356,6 +357,7 @@ struct radeon_fence_driver {
uint64_tsync_seq[RADEON_NUM_RINGS];
atomic64_t  last_seq;
boolinitialized;
+   struct delayed_work lockup_work;
 };

 struct radeon_fence {
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c 
b/drivers/gpu/drm/radeon/radeon_fence.c
index e8a28e7..ac15f34 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -98,6 +98,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int 
ring)
 }

 /**
+ * radeon_fence_schedule_check - schedule lockup check
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index we should work with
+ *
+ * Queues a delayed work item to check for lockups.
+ */
+static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring)
+{
+   /*
+* Do not reset the timer here with mod_delayed_work,
+* this can livelock in an interaction with TTM delayed destroy.
+*/
+   queue_delayed_work(system_power_efficient_wq,
+  &rdev->fence_drv[ring].lockup_work,
+  RADEON_FENCE_JIFFIES_TIMEOUT);
+}
+
+/**
  * radeon_fence_emit - emit a fence on the requested ring
  *
  * @rdev: radeon_device pointer
@@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev,
(*fence)->ring = ring;
radeon_fence_ring_emit(rdev, ring, *fence);
trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
+   radeon_fence_schedule_check(rdev, ring);
return 0;
 }

 /**
- * radeon_fence_process - process a fence
+ * radeon_fence_activity - check for fence activity
  *
  * @rdev: radeon_device pointer
  * @ring: ring index the fence is associated with
  *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
+ * Checks the current fence value and calculates the last
+ * signalled fence value. Returns true if activity occurred
+ * on the ring, and the fence_queue should be woken up.
  */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool radeon_fence_activity(struct radeon_device *rdev, int ring)
 {
uint64_t seq, last_seq, last_emitted;
unsigned count_loop = 0;
@@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int 
ring)
}
	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);

-   if (wake)
+   if (seq < last_emitted)
+   radeon_fence_schedule_check(rdev, ring);
+
+   return wake;
+}
+
+/**
+ * radeon_fence_check_lockup - check for hardware lockup
+ *
+ * @work: delayed work item
+ *
+ * Checks for fence activity and if there is none probe
+ * the hardware if a lockup occurred.
+ */
+static void radeon_fence_check_lockup(struct work_struct *work)
+{
+   struct radeon_fence_driver *fence_drv;
+   struct radeon_device *rdev;
+   int ring;
+
+   fence_drv = container_of(work, struct radeon_fence_driver,
+lockup_work.work);
+   rdev = fence_drv->rdev;
+   ring = fence_drv - &rdev->fence_drv[0];
+
+   if (!down_read_trylock(&rdev->exclusive_lock)) {
+   /* just reschedule the check if a reset is going on */
+   radeon_fence_schedule_check(rdev, ring);
+   return;
+   }
+
+   if (radeon_fence_activity(rdev, ring))
+   wake_up_all(&rdev->fence_queue);
+
+   else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
+
+   /* good news we believe it's a lockup */
+   dev_warn(rdev->dev, "GPU lockup (current fence id "
+"0x%016llx last fence id 0x%016llx on ring %d)\n",
+(uint64_t)atomic64_read(&fence_drv->last_seq),
+fence_drv->sync_seq[ring], ring);
+
+   /* remember that we need a reset */
+   

[PATCH 6/9] drm/radeon: handle lockup in delayed work, v5

2014-08-26 Thread Christian König
From: Christian König 

v5 (chk): complete rework, start when the first fence is emitted,
  stop when the last fence is signalled, make it work
  correctly with GPU resets, cleanup radeon_fence_wait_seq

Signed-off-by: Maarten Lankhorst 
Signed-off-by: Christian König 
---
 drivers/gpu/drm/radeon/radeon.h   |   2 +
 drivers/gpu/drm/radeon/radeon_fence.c | 200 +-
 2 files changed, 124 insertions(+), 78 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index f528ae8..fce8b32 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -349,6 +349,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, 
unsigned *bankw,
  * Fences.
  */
 struct radeon_fence_driver {
+   struct radeon_device*rdev;
uint32_tscratch_reg;
uint64_tgpu_addr;
volatile uint32_t   *cpu_addr;
@@ -356,6 +357,7 @@ struct radeon_fence_driver {
uint64_tsync_seq[RADEON_NUM_RINGS];
atomic64_t  last_seq;
boolinitialized;
+   struct delayed_work lockup_work;
 };

 struct radeon_fence {
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c 
b/drivers/gpu/drm/radeon/radeon_fence.c
index e8a28e7..ac15f34 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -98,6 +98,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int 
ring)
 }

 /**
+ * radeon_fence_schedule_check - schedule lockup check
+ *
+ * @rdev: radeon_device pointer
+ * @ring: ring index we should work with
+ *
+ * Queues a delayed work item to check for lockups.
+ */
+static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring)
+{
+   /*
+* Do not reset the timer here with mod_delayed_work,
+* this can livelock in an interaction with TTM delayed destroy.
+*/
+   queue_delayed_work(system_power_efficient_wq,
+  &rdev->fence_drv[ring].lockup_work,
+  RADEON_FENCE_JIFFIES_TIMEOUT);
+}
+
+/**
  * radeon_fence_emit - emit a fence on the requested ring
  *
  * @rdev: radeon_device pointer
@@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev,
(*fence)->ring = ring;
radeon_fence_ring_emit(rdev, ring, *fence);
trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
+   radeon_fence_schedule_check(rdev, ring);
return 0;
 }

 /**
- * radeon_fence_process - process a fence
+ * radeon_fence_activity - check for fence activity
  *
  * @rdev: radeon_device pointer
  * @ring: ring index the fence is associated with
  *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
+ * Checks the current fence value and calculates the last
+ * signalled fence value. Returns true if activity occurred
+ * on the ring, and the fence_queue should be woken up.
  */
-void radeon_fence_process(struct radeon_device *rdev, int ring)
+static bool radeon_fence_activity(struct radeon_device *rdev, int ring)
 {
uint64_t seq, last_seq, last_emitted;
unsigned count_loop = 0;
@@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int 
ring)
}
	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);

-   if (wake)
+   if (seq < last_emitted)
+   radeon_fence_schedule_check(rdev, ring);
+
+   return wake;
+}
+
+/**
+ * radeon_fence_check_lockup - check for hardware lockup
+ *
+ * @work: delayed work item
+ *
+ * Checks for fence activity and if there is none probe
+ * the hardware if a lockup occurred.
+ */
+static void radeon_fence_check_lockup(struct work_struct *work)
+{
+   struct radeon_fence_driver *fence_drv;
+   struct radeon_device *rdev;
+   int ring;
+
+   fence_drv = container_of(work, struct radeon_fence_driver,
+lockup_work.work);
+   rdev = fence_drv->rdev;
+   ring = fence_drv - &rdev->fence_drv[0];
+
+   if (!down_read_trylock(&rdev->exclusive_lock)) {
+   /* just reschedule the check if a reset is going on */
+   radeon_fence_schedule_check(rdev, ring);
+   return;
+   }
+
+   if (radeon_fence_activity(rdev, ring))
+   wake_up_all(&rdev->fence_queue);
+
+   else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
+
+   /* good news we believe it's a lockup */
+   dev_warn(rdev->dev, "GPU lockup (current fence id "
+"0x%016llx last fence id 0x%016llx on ring %d)\n",
+(uint64_t)atomic64_read(&fence_drv->last_seq),
+fence_drv->sync_seq[ring], ring);
+
+   /* remember that we need a reset */
+