When RLC_SPM_PERFMON_CNTL.PERFMON_RING_MODE is set to 0x11, the SPM
hardware is designed to fire a warning interrupt when the ring fill level
reaches RLC_SPM_SEGMENT_THRESHOLD, and stall only when the ring is
completely full. A hardware bug causes both the stall and the warning
interrupt to trigger simultaneously at RLC_SPM_SEGMENT_THRESHOLD, resulting
in an unexpected early hardware stall at interrupt time, which causes data
loss before the ring buffer is actually full.

This patch replaces interrupt-driven ring drain scheduling with a software
polling monitor thread to avoid triggering the premature stall.

Software polling monitor (amdgpu_spm_monitor_thread):
  A kthread named "spm_<render_index>" polls each active XCC instance
  in a tight loop with usleep_range(1, 11) (1-11 us per iteration).
  On each iteration, it attempts to acquire spm_worker_mutex with
  mutex_trylock() (skipping if contended) and for each active XCC checks
  whether the amount of unconsumed data in the ring exceeds half the ring
  capacity, computed as:
    (ring_size + ring_wptr - ring_rptr) % ring_size > (ring_size >> 1)
  This condition fires only once per rptr advancement (guarded by
  warned_ring_rptr != ring_rptr) to avoid redundant scheduling.
  When triggered, schedules amdgpu_spm_work to drain the ring and logs
  a "soft interrupt" debug message with the current rptr and wptr.

warned_ring_rptr (u32, added to amdgpu_spm_base):
  Tracks the rptr value at which the last soft interrupt was issued.
  Initialized to ~0U (U32_MAX) on acquire and on each SPM start/stop to
  ensure the first polling check always passes. Updated to the current
  rptr at the start of each ring buffer read to suppress duplicate
  scheduling for the same rptr position.

Monitor thread lifecycle:
  - Started lazily on the first SET_DEST_BUF call that starts the SPM
    hardware (dest_buf != NULL, is_spm_started transitions false→true).
    Only one thread is created per SPM session regardless of XCC count.
  - Stopped on SET_DEST_BUF with dest_buf=NULL (SPM stopped) and on
    AMDGPU_SPM_OP_RELEASE. spm_monitor_thread is initialized to NULL
    in amdgpu_spm_acquire() and set back to NULL by the thread itself
    on exit.

amdgpu_spm_interrupt():
  - On unaffected hardware (ip_versions[GC_HWIP][0] >= IP_VERSION(12, 0, 0)),
    continues to schedule amdgpu_spm_work directly, as before. On affected
    hardware (GC < 12.0.0) the polling thread is responsible for all ring
    drain scheduling, so the hardware interrupt only sets has_data_loss = 1
    under spm_irq_lock (indicating the ring reached the stall threshold)
    and logs a debug message.

Signed-off-by: James Zhu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c | 94 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h |  2 +
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
index 9b7bb15a3785..ee0fbf75709c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
@@ -38,6 +38,77 @@
 static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file 
*filp);
 static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst, 
struct drm_file *filp);
 
+static int amdgpu_spm_monitor_thread(void *param)
+{
+       struct amdgpu_spm_mgr *spm_mgr = param;
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+
+       allow_signal(SIGKILL);
+       while (!kthread_should_stop() &&
+                       !signal_pending(spm_mgr->spm_monitor_thread) && 
spm_mgr->spm_cntr) {
+               bool need_schedule = false;
+               u32 inst;
+
+               usleep_range(1, 11);
+
+               if (!mutex_trylock(&spm_mgr->spm_cntr->spm_worker_mutex))
+                       continue;
+
+               for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
+                       struct amdgpu_spm_base *spm = 
&(spm_mgr->spm_cntr->spm[inst]);
+                       u32 warned_ring_rptr;
+                       u32 ring_size;
+                       u32 ring_rptr;
+                       u32 ring_wptr;
+
+                       if (!spm->is_spm_started)
+                               continue;
+
+                       ring_size = spm->ring_size;
+                       ring_rptr = spm->ring_rptr;
+                       warned_ring_rptr = spm->warned_ring_rptr;
+                       ring_wptr = READ_ONCE(spm->cpu_addr[0]);
+
+                       if (need_schedule || (ring_rptr != warned_ring_rptr &&
+                               (ring_size + ring_wptr - ring_rptr) % ring_size 
>
+                                       (ring_size >> 1))) {
+                               spm->warned_ring_rptr = ring_rptr;
+                               if (!need_schedule) {
+                                       dev_dbg(adev->dev,
+                                               "[SPM#%d] soft interrupt 
rptr:0x%08x--wptr:0x%08x",
+                                                inst, ring_rptr, ring_wptr);
+                                       need_schedule = true;
+                               }
+                       }
+               }
+               mutex_unlock(&spm_mgr->spm_cntr->spm_worker_mutex);
+               if (need_schedule)
+                       schedule_work(&spm_mgr->spm_work);
+       }
+       spm_mgr->spm_monitor_thread = NULL;
+       return 0;
+}
+
+static int amdgpu_spm_monitor_thread_start(struct amdgpu_spm_mgr *spm_mgr)
+{
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+       char thread_name[16];
+       int ret = 0;
+
+       snprintf(thread_name, 16, "spm_%d", adev->ddev.render->index);
+       spm_mgr->spm_monitor_thread =
+               kthread_run(amdgpu_spm_monitor_thread, spm_mgr, thread_name);
+
+       if (IS_ERR(spm_mgr->spm_monitor_thread)) {
+               ret = PTR_ERR(spm_mgr->spm_monitor_thread);
+               spm_mgr->spm_monitor_thread = NULL;
+               dev_dbg(adev->dev, "Failed to create spm monitor thread %s with 
ret = %d.",
+                       thread_name, ret);
+       }
+
+       return ret;
+}
+
 static void amdgpu_spm_preset(struct amdgpu_spm_base *spm, u32 size)
 {
        uint64_t *overflow_ptr, *overflow_end_ptr;
@@ -122,6 +193,7 @@ static int amdgpu_spm_read_ring_buffer(struct 
amdgpu_spm_mgr *spm_mgr, int inst)
        if (spm->ring_rptr == ring_wptr)
                goto exit;
 
+       spm->warned_ring_rptr = spm->ring_rptr;
        if (ring_wptr > spm->ring_rptr) {
                size_to_copy = ring_wptr - spm->ring_rptr;
                ret = amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst);
@@ -246,6 +318,7 @@ static int _amdgpu_spm_acquire(struct amdgpu_spm_mgr 
*spm_mgr, int inst, struct
         */
        spm->ring_size -= 0x20;
        amdgpu_spm_preset(spm, spm_mgr->spm_overflow_reserved);
+       spm->warned_ring_rptr = ~0;
 
        goto out;
 
@@ -290,6 +363,7 @@ static int amdgpu_spm_acquire(struct amdgpu_spm_mgr 
*spm_mgr, struct drm_file *f
        INIT_WORK(&spm_mgr->spm_work, amdgpu_spm_work);
 
        spin_lock_init(&spm_mgr->spm_irq_lock);
+       spm_mgr->spm_monitor_thread = NULL;
        spm_mgr->file = filp;
 
        goto out;
@@ -340,6 +414,9 @@ static int amdgpu_spm_release(struct amdgpu_spm_mgr 
*spm_mgr, struct drm_file *f
                goto out;
        }
 
+       if (spm_mgr->spm_monitor_thread)
+               kthread_stop(spm_mgr->spm_monitor_thread);
+
        for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
                spin_lock_irqsave(&spm_mgr->spm_irq_lock, flags);
                spm_mgr->spm_cntr->spm[inst].is_spm_started = false;
@@ -526,7 +603,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr 
*spm_mgr, void *data)
                                 * wptr will become 0, adjust rptr accordingly.
                                 */
                                spm->ring_rptr = 0;
+                               spm->warned_ring_rptr = ~0;
                                spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, 
flags);
+                               if (!spm_mgr->spm_monitor_thread)
+                                       
amdgpu_spm_monitor_thread_start(spm_mgr);
                        } else {
                                /* If SPM was already started, there may already
                                 * be data in the ring-buffer that needs to be 
read.
@@ -542,7 +622,10 @@ static int amdgpu_set_dest_buffer(struct amdgpu_spm_mgr 
*spm_mgr, void *data)
                         * Adjust rptr accordingly
                         */
                        spm->ring_rptr = 0;
+                       spm->warned_ring_rptr = ~0;
                        spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags);
+                       if (spm_mgr->spm_monitor_thread)
+                               kthread_stop(spm_mgr->spm_monitor_thread);
                }
        }
 
@@ -606,8 +689,15 @@ void amdgpu_spm_interrupt(struct amdgpu_device *adev, int 
xcc_id)
 
        spm_mgr = &(adev->prof_mgr.prof_xcp_mgr[xcp_id].spm_mgr);
 
+       /*  watermark threshold is triggered */
        spin_lock_irqsave(&spm_mgr->spm_irq_lock, flags);
-       if (spm_mgr->spm_cntr && spm_mgr->spm_cntr->spm[xcc_id].is_spm_started)
-               schedule_work(&spm_mgr->spm_work);
+       if (spm_mgr->spm_cntr && spm_mgr->spm_cntr->spm[xcc_id].is_spm_started) 
{
+               if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(12, 0, 0))
+                       spm_mgr->spm_cntr->spm[xcc_id].has_data_loss = 1;
+               else
+                       schedule_work(&spm_mgr->spm_work);
+       }
        spin_unlock_irqrestore(&spm_mgr->spm_irq_lock, flags);
+
+       dev_dbg(adev->dev, "[SPM#%d:%d] ring buffer stall.", xcp_id, xcc_id);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
index f00a4751643e..db440b2e11f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
@@ -41,6 +41,7 @@ struct amdgpu_spm_base {
        bool   has_user_buf;
        bool   is_user_buf_filled;
        bool   is_spm_started;
+       u32    warned_ring_rptr;
 };
 
 struct amdgpu_spm_cntr {
@@ -53,6 +54,7 @@ struct amdgpu_spm_cntr {
 };
 
 struct amdgpu_spm_mgr {
+       struct task_struct *spm_monitor_thread;
        struct drm_file *file;
 
        struct task_struct *lead_thread;
-- 
2.34.1

Reply via email to