When register RLC_SPM_PERFMON_CNTL/PERFMON_RING_MODE is set to 0x11, the SPM HW fires a warning interrupt when rptr reaches RLC_SPM_SEGMENT_THRESHOLD, and stalls when rptr reaches the end of the ring buffer. But a HW bug causes both the stall and the interrupt to arise when rptr reaches RLC_SPM_SEGMENT_THRESHOLD, which means unexpected data loss from the early SPM HW stall by the time the interrupt is received. This fix uses polling mode instead to avoid the unexpected early SPM HW stall.
Signed-off-by: James Zhu <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c | 88 ++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h | 1 + 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c index f09c237cc8f7..4cfb3c49b9b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c @@ -49,6 +49,7 @@ struct amdgpu_spm_base { bool has_user_buf; bool is_user_buf_filled; bool is_spm_started; + u32 warned_ring_rptr; }; struct amdgpu_spm_cntr { @@ -66,6 +67,77 @@ struct amdgpu_spm_cntr { static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file *filp); static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst, struct drm_file *filp); +static int amdgpu_spm_monitor_thread(void *param) +{ + struct amdgpu_spm_mgr *spm_mgr = param; + struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr); + + allow_signal(SIGKILL); + while (!kthread_should_stop() && + !signal_pending(spm_mgr->spm_monitor_thread) && spm_mgr->spm_cntr) { + bool need_schedule = false; + u32 inst; + + usleep_range(1, 11); + + if (!mutex_trylock(&spm_mgr->spm_cntr->spm_worker_mutex)) + continue; + + for_each_inst(inst, AMDGPU_XCC_MASK(adev)) { + struct amdgpu_spm_base *spm = &(spm_mgr->spm_cntr->spm[inst]); + u32 warned_ring_rptr; + u32 ring_size; + u32 ring_rptr; + u32 ring_wptr; + + if (!spm->is_spm_started) + continue; + + ring_size = spm->ring_size; + ring_rptr = spm->ring_rptr; + warned_ring_rptr = spm->warned_ring_rptr; + ring_wptr = READ_ONCE(spm->cpu_addr[0]); + + if (need_schedule || (ring_rptr != warned_ring_rptr && + (ring_size + ring_wptr - ring_rptr) % ring_size > + (ring_size >> 1))) { + spm->warned_ring_rptr = ring_rptr; + if (!need_schedule) { + dev_dbg(adev->dev, + "[SPM#%d] soft interrupt rptr:0x%08x--wptr:0x%08x", + inst, ring_rptr, ring_wptr); + need_schedule = true; + } + } + } + 
mutex_unlock(&spm_mgr->spm_cntr->spm_worker_mutex); + if (need_schedule) + schedule_work(&spm_mgr->spm_work); + } + spm_mgr->spm_monitor_thread = NULL; + return 0; +} + +static int amdgpu_spm_monitor_thread_start(struct amdgpu_spm_mgr *spm_mgr) +{ + struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr); + char thread_name[16]; + int ret = 0; + + snprintf(thread_name, 16, "spm_%d", adev->ddev.render->index); + spm_mgr->spm_monitor_thread = + kthread_run(amdgpu_spm_monitor_thread, spm_mgr, thread_name); + + if (IS_ERR(spm_mgr->spm_monitor_thread)) { + ret = PTR_ERR(spm_mgr->spm_monitor_thread); + spm_mgr->spm_monitor_thread = NULL; + dev_dbg(adev->dev, "Failed to create spm monitor thread %s with ret = %d.", + thread_name, ret); + } + + return ret; +} + static void amdgpu_spm_preset(struct amdgpu_spm_base *spm, u32 size) { uint64_t *overflow_ptr, *overflow_end_ptr; @@ -152,6 +224,7 @@ static int amdgpu_spm_read_ring_buffer(struct amdgpu_spm_mgr *spm_mgr, int inst) if (spm->ring_rptr == ring_wptr) goto exit; + spm->warned_ring_rptr = spm->ring_rptr; if (ring_wptr > spm->ring_rptr) { size_to_copy = ring_wptr - spm->ring_rptr; ret = amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst); @@ -277,6 +350,7 @@ static int _amdgpu_spm_acquire(struct amdgpu_spm_mgr *spm_mgr, int inst, struct goto rlc_spm_acquire_failure; amdgpu_spm_preset(spm, spm_mgr->spm_overflow_reserved); + spm->warned_ring_rptr = ~0; goto out; rlc_spm_acquire_failure: @@ -320,6 +394,7 @@ static int amdgpu_spm_acquire(struct amdgpu_spm_mgr *spm_mgr, struct drm_file *f INIT_WORK(&spm_mgr->spm_work, amdgpu_spm_work); spin_lock_init(&spm_mgr->spm_irq_lock); + spm_mgr->spm_monitor_thread = NULL; spm_mgr->file = filp; goto out; @@ -366,6 +441,9 @@ static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file *f goto out; } + if (spm_mgr->spm_monitor_thread) + kthread_stop(spm_mgr->spm_monitor_thread); + for_each_inst(inst, AMDGPU_XCC_MASK(adev)) { spin_lock_irqsave(&spm_mgr->spm_irq_lock, 
flags); spm_mgr->spm_cntr->spm[inst].is_spm_started = false; @@ -551,7 +629,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr *spm_mgr, void __user *d * wptr will become 0, adjust rptr accordingly. */ spm->ring_rptr = 0; + spm->warned_ring_rptr = ~0; spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags); + if (!spm_mgr->spm_monitor_thread) + amdgpu_spm_monitor_thread_start(spm_mgr); } else { /* If SPM was already started, there may already * be data in the ring-buffer that needs to be read. @@ -567,7 +648,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr *spm_mgr, void __user *d * Adjust rptr accordingly */ spm->ring_rptr = 0; + spm->warned_ring_rptr = ~0; spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags); + if (spm_mgr->spm_monitor_thread) + kthread_stop(spm_mgr->spm_monitor_thread); } } @@ -638,6 +722,8 @@ void amdgpu_spm_interrupt(struct amdgpu_device *adev, int xcc_id) spin_lock_irqsave(&spm_mgr->spm_irq_lock, flags); if (spm_mgr->spm_cntr && spm_mgr->spm_cntr->spm[xcc_id].is_spm_started) - schedule_work(&spm_mgr->spm_work); + spm_mgr->spm_cntr->spm[xcc_id].has_data_loss = true; spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags); + + dev_dbg(adev->dev, "[SPM#%d:%d] ring buffer stall.", xcp_id, xcc_id); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h index f3d812fa4e2b..aca627b6c2de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h @@ -24,6 +24,7 @@ #define AMDGPU_SPM_H_ struct amdgpu_spm_mgr { + struct task_struct *spm_monitor_thread; struct drm_file *file; struct task_struct *lead_thread; -- 2.34.1
