When RLC_SPM_PERFMON_CNTL.PERFMON_RING_MODE is set to 0x11, the SPM
hardware is designed to fire a warning interrupt when the ring fill level
reaches RLC_SPM_SEGMENT_THRESHOLD, and stall only when the ring is
completely full. A hardware bug causes both the stall and the warning
interrupt to trigger simultaneously at RLC_SPM_SEGMENT_THRESHOLD, resulting
in an unexpected early hardware stall at interrupt time, which causes data
loss before the ring buffer is actually full.
This patch replaces interrupt-driven ring drain scheduling with a software
polling monitor thread to avoid triggering the premature stall.
Software polling monitor (amdgpu_spm_monitor_thread):
A kthread named "spm_<render_index>" polls each active XCC instance
in a tight loop with usleep_range(1, 11) (1-11 us per iteration).
On each iteration, it attempts to acquire spm_worker_mutex with
mutex_trylock() (skipping if contended) and for each active XCC checks
whether the amount of unconsumed data in the ring exceeds half the ring
capacity, computed as:
(ring_size + ring_wptr - ring_rptr) % ring_size > (ring_size >> 1)
This condition fires only once per rptr advancement (guarded by
warned_ring_rptr != ring_rptr) to avoid redundant scheduling.
When triggered, schedules amdgpu_spm_work to drain the ring and logs
a "soft interrupt" debug message with the current rptr and wptr.
warned_ring_rptr (u32, added to amdgpu_spm_base):
Tracks the rptr value at which the last soft interrupt was issued.
Initialized to ~0U (U32_MAX) on acquire and on each SPM start/stop to
ensure the first polling check always passes. Updated to the current
rptr at the start of each ring buffer read to suppress duplicate
scheduling for the same rptr position.
Monitor thread lifecycle:
- Started lazily on the first SET_DEST_BUF call that starts the SPM
hardware (dest_buf != NULL, is_spm_started transitions false→true).
Only one thread is created per SPM session regardless of XCC count.
- Stopped on SET_DEST_BUF with dest_buf=NULL (SPM stopped) and on
AMDGPU_SPM_OP_RELEASE. spm_monitor_thread is initialized to NULL
in amdgpu_spm_acquire() and set back to NULL by the thread itself
on exit.
amdgpu_spm_interrupt():
- On unaffected hardware (ip_versions[GC_HWIP][0] >= IP_VERSION(12, 0, 0))
the interrupt still schedules amdgpu_spm_work directly, as before. On
affected hardware the polling thread is responsible for all ring drain
scheduling, so the hardware interrupt only sets has_data_loss=1 under
spm_irq_lock (indicating the ring reached the stall threshold) and logs
a debug message.
Signed-off-by: James Zhu <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c | 94 ++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h | 2 +
2 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
index 9b7bb15a3785..ee0fbf75709c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
@@ -38,6 +38,77 @@
static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file
*filp);
static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst,
struct drm_file *filp);
+/*
+ * amdgpu_spm_monitor_thread - software polling replacement for the SPM
+ * ring-fill warning interrupt on hardware where that interrupt also
+ * triggers a premature stall.
+ *
+ * Polls every started XCC instance and, when the unconsumed data in a
+ * ring exceeds half of the ring capacity, schedules spm_work to drain
+ * the rings.  warned_ring_rptr guards against issuing more than one
+ * "soft interrupt" for the same rptr position.
+ *
+ * Runs until kthread_stop(), a SIGKILL, or SPM teardown (spm_cntr
+ * going NULL).  Clears spm_mgr->spm_monitor_thread on exit.
+ */
+static int amdgpu_spm_monitor_thread(void *param)
+{
+	struct amdgpu_spm_mgr *spm_mgr = param;
+	struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+
+	allow_signal(SIGKILL);
+	/*
+	 * Test signals on 'current', not on spm_mgr->spm_monitor_thread:
+	 * that field is assigned from kthread_run()'s return value in the
+	 * parent and may not be visible yet when this thread first runs,
+	 * so dereferencing it here could fault.
+	 */
+	while (!kthread_should_stop() && !signal_pending(current) &&
+	       spm_mgr->spm_cntr) {
+		bool need_schedule = false;
+		u32 inst;
+
+		usleep_range(1, 11);
+
+		/* Skip this iteration rather than contend with the drain worker. */
+		if (!mutex_trylock(&spm_mgr->spm_cntr->spm_worker_mutex))
+			continue;
+
+		for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
+			struct amdgpu_spm_base *spm = &(spm_mgr->spm_cntr->spm[inst]);
+			u32 warned_ring_rptr;
+			u32 ring_size;
+			u32 ring_rptr;
+			u32 ring_wptr;
+
+			if (!spm->is_spm_started)
+				continue;
+
+			ring_size = spm->ring_size;
+			ring_rptr = spm->ring_rptr;
+			warned_ring_rptr = spm->warned_ring_rptr;
+			ring_wptr = READ_ONCE(spm->cpu_addr[0]);
+
+			/*
+			 * Fill level is (wptr - rptr) mod ring_size; warn once
+			 * per rptr position when it exceeds half the ring.
+			 * Once scheduling is decided, the remaining started
+			 * instances are marked warned as well, since spm_work
+			 * drains all of them.
+			 */
+			if (need_schedule || (ring_rptr != warned_ring_rptr &&
+			    (ring_size + ring_wptr - ring_rptr) % ring_size >
+			    (ring_size >> 1))) {
+				spm->warned_ring_rptr = ring_rptr;
+				if (!need_schedule) {
+					dev_dbg(adev->dev,
+						"[SPM#%d] soft interrupt rptr:0x%08x--wptr:0x%08x",
+						inst, ring_rptr, ring_wptr);
+					need_schedule = true;
+				}
+			}
+		}
+		mutex_unlock(&spm_mgr->spm_cntr->spm_worker_mutex);
+		if (need_schedule)
+			schedule_work(&spm_mgr->spm_work);
+	}
+	spm_mgr->spm_monitor_thread = NULL;
+	return 0;
+}
+
+/*
+ * amdgpu_spm_monitor_thread_start - create the per-session SPM polling
+ * thread, named "spm_<render_index>".
+ *
+ * Returns 0 on success or a negative errno from kthread_run(); on
+ * failure spm_mgr->spm_monitor_thread is left NULL.
+ */
+static int amdgpu_spm_monitor_thread_start(struct amdgpu_spm_mgr *spm_mgr)
+{
+	struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+	char thread_name[16];
+	int ret = 0;
+
+	snprintf(thread_name, sizeof(thread_name), "spm_%d",
+		 adev->ddev.render->index);
+	/*
+	 * kthread_run()'s namefmt is a printf-style format string; pass the
+	 * prepared name through "%s" so it is never interpreted as a format.
+	 */
+	spm_mgr->spm_monitor_thread =
+		kthread_run(amdgpu_spm_monitor_thread, spm_mgr, "%s", thread_name);
+
+	if (IS_ERR(spm_mgr->spm_monitor_thread)) {
+		ret = PTR_ERR(spm_mgr->spm_monitor_thread);
+		spm_mgr->spm_monitor_thread = NULL;
+		dev_dbg(adev->dev, "Failed to create spm monitor thread %s with ret = %d.",
+			thread_name, ret);
+	}
+
+	return ret;
+}
+
static void amdgpu_spm_preset(struct amdgpu_spm_base *spm, u32 size)
{
uint64_t *overflow_ptr, *overflow_end_ptr;
@@ -122,6 +193,7 @@ static int amdgpu_spm_read_ring_buffer(struct
amdgpu_spm_mgr *spm_mgr, int inst)
if (spm->ring_rptr == ring_wptr)
goto exit;
+ spm->warned_ring_rptr = spm->ring_rptr;
if (ring_wptr > spm->ring_rptr) {
size_to_copy = ring_wptr - spm->ring_rptr;
ret = amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst);
@@ -246,6 +318,7 @@ static int _amdgpu_spm_acquire(struct amdgpu_spm_mgr
*spm_mgr, int inst, struct
*/
spm->ring_size -= 0x20;
amdgpu_spm_preset(spm, spm_mgr->spm_overflow_reserved);
+ spm->warned_ring_rptr = ~0;
goto out;
@@ -290,6 +363,7 @@ static int amdgpu_spm_acquire(struct amdgpu_spm_mgr
*spm_mgr, struct drm_file *f
INIT_WORK(&spm_mgr->spm_work, amdgpu_spm_work);
spin_lock_init(&spm_mgr->spm_irq_lock);
+ spm_mgr->spm_monitor_thread = NULL;
spm_mgr->file = filp;
goto out;
@@ -340,6 +414,9 @@ static int amdgpu_spm_release(struct amdgpu_spm_mgr
*spm_mgr, struct drm_file *f
goto out;
}
+ if (spm_mgr->spm_monitor_thread)
+ kthread_stop(spm_mgr->spm_monitor_thread);
+
for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
spin_lock_irqsave(&spm_mgr->spm_irq_lock, flags);
spm_mgr->spm_cntr->spm[inst].is_spm_started = false;
@@ -526,7 +603,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr
*spm_mgr, void *data)
* wptr will become 0, adjust rptr accordingly.
*/
spm->ring_rptr = 0;
+ spm->warned_ring_rptr = ~0;
spin_unlock_irqrestore(&spm_mgr->spm_irq_lock,
flags);
+ if (!spm_mgr->spm_monitor_thread)
+
amdgpu_spm_monitor_thread_start(spm_mgr);
} else {
/* If SPM was already started, there may already
* be data in the ring-buffer that needs to be
read.
@@ -542,7 +622,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr
*spm_mgr, void *data)
* Adjust rptr accordingly
*/
spm->ring_rptr = 0;
+ spm->warned_ring_rptr = ~0;
spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags);
+ if (spm_mgr->spm_monitor_thread)
+ kthread_stop(spm_mgr->spm_monitor_thread);
}
}
@@ -606,8 +689,15 @@ void amdgpu_spm_interrupt(struct amdgpu_device *adev, int
xcc_id)
spm_mgr = &(adev->prof_mgr.prof_xcp_mgr[xcp_id].spm_mgr);
+ /* watermark threshold is triggered */
spin_lock_irqsave(&spm_mgr->spm_irq_lock, flags);
- if (spm_mgr->spm_cntr && spm_mgr->spm_cntr->spm[xcc_id].is_spm_started)
- schedule_work(&spm_mgr->spm_work);
+ if (spm_mgr->spm_cntr && spm_mgr->spm_cntr->spm[xcc_id].is_spm_started)
{
+ if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(12, 0, 0))
+ spm_mgr->spm_cntr->spm[xcc_id].has_data_loss = 1;
+ else
+ schedule_work(&spm_mgr->spm_work);
+ }
spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags);
+
+ dev_dbg(adev->dev, "[SPM#%d:%d] ring buffer stall.", xcp_id, xcc_id);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
index f00a4751643e..db440b2e11f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
@@ -41,6 +41,7 @@ struct amdgpu_spm_base {
bool has_user_buf;
bool is_user_buf_filled;
bool is_spm_started;
+ u32 warned_ring_rptr;
};
struct amdgpu_spm_cntr {
@@ -53,6 +54,7 @@ struct amdgpu_spm_cntr {
};
struct amdgpu_spm_mgr {
+ struct task_struct *spm_monitor_thread;
struct drm_file *file;
struct task_struct *lead_thread;
--
2.34.1