[PATCH 6/9] drm/radeon: handle lockup in delayed work, v5
From: Christian König v5 (chk): complete rework, start when the first fence is emitted, stop when the last fence is signalled, make it work correctly with GPU resets, cleanup radeon_fence_wait_seq Signed-off-by: Maarten Lankhorst Signed-off-by: Christian König --- drivers/gpu/drm/radeon/radeon.h | 2 + drivers/gpu/drm/radeon/radeon_fence.c | 200 +- 2 files changed, 124 insertions(+), 78 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index f528ae8..fce8b32 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -349,6 +349,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw, * Fences. */ struct radeon_fence_driver { + struct radeon_device *rdev; uint32_t scratch_reg; uint64_t gpu_addr; volatile uint32_t *cpu_addr; @@ -356,6 +357,7 @@ struct radeon_fence_driver { uint64_t sync_seq[RADEON_NUM_RINGS]; atomic64_t last_seq; bool initialized; + struct delayed_work lockup_work; }; struct radeon_fence { diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index e8a28e7..ac15f34 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -98,6 +98,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring) } /** + * radeon_fence_schedule_check - schedule lockup check + * + * @rdev: radeon_device pointer + * @ring: ring index we should work with + * + * Queues a delayed work item to check for lockups. + */ +static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring) +{ + /* + * Do not reset the timer here with mod_delayed_work, + * this can livelock in an interaction with TTM delayed destroy. 
+ */ + queue_delayed_work(system_power_efficient_wq, + &rdev->fence_drv[ring].lockup_work, + RADEON_FENCE_JIFFIES_TIMEOUT); +} + +/** * radeon_fence_emit - emit a fence on the requested ring * * @rdev: radeon_device pointer @@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev, (*fence)->ring = ring; radeon_fence_ring_emit(rdev, ring, *fence); trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq); + radeon_fence_schedule_check(rdev, ring); return 0; } /** - * radeon_fence_process - process a fence + * radeon_fence_activity - check for fence activity * * @rdev: radeon_device pointer * @ring: ring index the fence is associated with * - * Checks the current fence value and wakes the fence queue - * if the sequence number has increased (all asics). + * Checks the current fence value and calculates the last + * signalled fence value. Returns true if activity occured + * on the ring, and the fence_queue should be waken up. */ -void radeon_fence_process(struct radeon_device *rdev, int ring) +static bool radeon_fence_activity(struct radeon_device *rdev, int ring) { uint64_t seq, last_seq, last_emitted; unsigned count_loop = 0; @@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) } } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); - if (wake) + if (seq < last_emitted) + radeon_fence_schedule_check(rdev, ring); + + return wake; +} + +/** + * radeon_fence_check_lockup - check for hardware lockup + * + * @work: delayed work item + * + * Checks for fence activity and if there is none probe + * the hardware if a lockup occured. 
+ */ +static void radeon_fence_check_lockup(struct work_struct *work) +{ + struct radeon_fence_driver *fence_drv; + struct radeon_device *rdev; + int ring; + + fence_drv = container_of(work, struct radeon_fence_driver, + lockup_work.work); + rdev = fence_drv->rdev; + ring = fence_drv - &rdev->fence_drv[0]; + + if (!down_read_trylock(&rdev->exclusive_lock)) { + /* just reschedule the check if a reset is going on */ + radeon_fence_schedule_check(rdev, ring); + return; + } + + if (radeon_fence_activity(rdev, ring)) + wake_up_all(&rdev->fence_queue); + + else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) { + + /* good news we believe it's a lockup */ + dev_warn(rdev->dev, "GPU lockup (current fence id " + "0x%016llx last fence id 0x%016llx on ring %d)\n", + (uint64_t)atomic64_read(&fence_drv->last_seq), + fence_drv->sync_seq[ring], ring); + + /* remember that we need an reset */ +
[PATCH 6/9] drm/radeon: handle lockup in delayed work, v5
From: Christian König v5 (chk): complete rework, start when the first fence is emitted, stop when the last fence is signalled, make it work correctly with GPU resets, cleanup radeon_fence_wait_seq Signed-off-by: Maarten Lankhorst Signed-off-by: Christian König --- drivers/gpu/drm/radeon/radeon.h | 2 + drivers/gpu/drm/radeon/radeon_fence.c | 200 +- 2 files changed, 124 insertions(+), 78 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index f528ae8..fce8b32 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -349,6 +349,7 @@ extern void evergreen_tiling_fields(unsigned tiling_flags, unsigned *bankw, * Fences. */ struct radeon_fence_driver { + struct radeon_device *rdev; uint32_t scratch_reg; uint64_t gpu_addr; volatile uint32_t *cpu_addr; @@ -356,6 +357,7 @@ struct radeon_fence_driver { uint64_t sync_seq[RADEON_NUM_RINGS]; atomic64_t last_seq; bool initialized; + struct delayed_work lockup_work; }; struct radeon_fence { diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index e8a28e7..ac15f34 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -98,6 +98,25 @@ static u32 radeon_fence_read(struct radeon_device *rdev, int ring) } /** + * radeon_fence_schedule_check - schedule lockup check + * + * @rdev: radeon_device pointer + * @ring: ring index we should work with + * + * Queues a delayed work item to check for lockups. + */ +static void radeon_fence_schedule_check(struct radeon_device *rdev, int ring) +{ + /* + * Do not reset the timer here with mod_delayed_work, + * this can livelock in an interaction with TTM delayed destroy. 
+ */ + queue_delayed_work(system_power_efficient_wq, + &rdev->fence_drv[ring].lockup_work, + RADEON_FENCE_JIFFIES_TIMEOUT); +} + +/** * radeon_fence_emit - emit a fence on the requested ring * * @rdev: radeon_device pointer @@ -122,19 +141,21 @@ int radeon_fence_emit(struct radeon_device *rdev, (*fence)->ring = ring; radeon_fence_ring_emit(rdev, ring, *fence); trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq); + radeon_fence_schedule_check(rdev, ring); return 0; } /** - * radeon_fence_process - process a fence + * radeon_fence_activity - check for fence activity * * @rdev: radeon_device pointer * @ring: ring index the fence is associated with * - * Checks the current fence value and wakes the fence queue - * if the sequence number has increased (all asics). + * Checks the current fence value and calculates the last + * signalled fence value. Returns true if activity occured + * on the ring, and the fence_queue should be waken up. */ -void radeon_fence_process(struct radeon_device *rdev, int ring) +static bool radeon_fence_activity(struct radeon_device *rdev, int ring) { uint64_t seq, last_seq, last_emitted; unsigned count_loop = 0; @@ -190,7 +211,67 @@ void radeon_fence_process(struct radeon_device *rdev, int ring) } } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq); - if (wake) + if (seq < last_emitted) + radeon_fence_schedule_check(rdev, ring); + + return wake; +} + +/** + * radeon_fence_check_lockup - check for hardware lockup + * + * @work: delayed work item + * + * Checks for fence activity and if there is none probe + * the hardware if a lockup occured. 
+ */ +static void radeon_fence_check_lockup(struct work_struct *work) +{ + struct radeon_fence_driver *fence_drv; + struct radeon_device *rdev; + int ring; + + fence_drv = container_of(work, struct radeon_fence_driver, + lockup_work.work); + rdev = fence_drv->rdev; + ring = fence_drv - &rdev->fence_drv[0]; + + if (!down_read_trylock(&rdev->exclusive_lock)) { + /* just reschedule the check if a reset is going on */ + radeon_fence_schedule_check(rdev, ring); + return; + } + + if (radeon_fence_activity(rdev, ring)) + wake_up_all(&rdev->fence_queue); + + else if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) { + + /* good news we believe it's a lockup */ + dev_warn(rdev->dev, "GPU lockup (current fence id " + "0x%016llx last fence id 0x%016llx on ring %d)\n", + (uint64_t)atomic64_read(&fence_drv->last_seq), + fence_drv->sync_seq[ring], ring); + + /* remember that we need an reset */ +