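When CPER ring overflow handling advances the read pointer (rptr), it trusts the entry size parsed from the current ring contents. Corrupt CPER data can produce an entry size that does not advance rptr after dword conversion and pointer masking: for example, a size below four bytes converts to zero dwords, and a dword count that is a multiple of ptr_mask + 1 masks back to the same position.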
In that case the recovery loop keeps testing the same location while holding the CPER ring mutex. This can hang the worker that is writing the next CPER record.

Detect a no-progress rptr update and reset the CPER ring to an empty state instead. This drops the corrupt contents and lets the writer leave the recovery path without spinning.

Signed-off-by: Xiang Liu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 004edc28d0cc..d5e59c24d907 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -484,7 +484,7 @@ static u32 amdgpu_cper_ring_get_ent_sz(struct amdgpu_ring *ring, u64 pos)

 void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)
 {
-	u64 pos, wptr_old, rptr;
+	u64 pos, wptr_old, rptr, next_rptr;
 	int rec_cnt_dw = count >> 2;
 	u32 chunk, ent_sz;
 	u8 *s = (u8 *)src;
@@ -525,9 +525,19 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)

 		do {
 			ent_sz = amdgpu_cper_ring_get_ent_sz(ring, pos);
-
-			rptr += (ent_sz >> 2);
-			rptr &= ring->ptr_mask;
+			next_rptr = rptr;
+			if (ent_sz >= sizeof(u32))
+				next_rptr = (rptr + (ent_sz >> 2)) & ring->ptr_mask;
+
+			if (next_rptr == rptr) {
+				/* Corrupt entry size, reset the ring to avoid an infinite loop. */
+				rptr = ring->wptr;
+				*ring->rptr_cpu_addr = rptr;
+				ring->count_dw = (ring->ring_size - 4) >> 2;
+				goto out_unlock;
+			}
+
+			rptr = next_rptr;
 			*ring->rptr_cpu_addr = rptr;

 			pos = rptr;
@@ -536,6 +546,8 @@ void amdgpu_cper_ring_write(struct amdgpu_ring *ring, void *src, int count)

 	if (ring->count_dw >= rec_cnt_dw)
 		ring->count_dw -= rec_cnt_dw;
+
+out_unlock:
 	mutex_unlock(&ring->adev->cper.ring_lock);
 }

-- 
2.54.0
