to complete the amdgpu_spm_work() stub by implementing the full
ring buffer drain pipeline: reading the hardware write pointer,
copying counter data from the GTT ring buffer to the user-space
destination buffer, advancing the hardware read pointer, and
waking up the caller once any XCC buffer is filled.

amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst):
  Copies size_to_copy bytes from the kernel-mapped GTT ring buffer to
  the user-space destination buffer for one XCC instance.
  - Per the RLC hardware spec, ring_rptr=0 maps to cpu_addr + 0x20
    (the ring data starts 0x20 bytes past the base of the BO).
  - The destination user_address is computed as ubuf.user_addr +
    size_copied (continuing from where the previous copy left off).
  - If size_to_copy fits within the remaining user buffer space
    (ubufsize - size_copied): copies the full amount, advances
    size_copied and ring_rptr by size_to_copy.
  - If size_to_copy would overflow the user buffer: copies only the
    remaining space (user_buf_space_left), sets size_copied to ubufsize,
    advances ring_rptr, and marks is_user_buf_filled = true.
  - Sets has_data_loss = 1 and returns -EFAULT on copy_to_user()
    failure.

amdgpu_spm_read_ring_buffer(spm_mgr, inst):
  Reads the hardware wptr from cpu_addr[0] via READ_ONCE() followed by
  dma_rmb() to ensure ring buffer data written by the GPU is visible to
  the CPU before processing.
  - If no user buffer is registered (has_user_buf=false) or the current
    buffer is already full (is_user_buf_filled=true): sets has_data_loss=1,
    force-sets is_user_buf_filled=true (to handle the polling timeout
    path where the flag was not previously set), and exits. This signals
    to the user that the SPM hardware may stall.
  - If rptr == wptr: no new data, exits immediately.
  - Linear case (wptr > rptr): calls amdgpu_spm_data_copy() for the
    contiguous segment.
  - Wrap-around case (wptr < rptr): calls amdgpu_spm_data_copy() for
    the tail segment (rptr → ring_end). If rptr reaches ring_end exactly:
    if wptr == 0, resets rptr to 0 and exits; otherwise resets rptr to 0
    and calls amdgpu_spm_data_copy() for the wrapped head segment
    (0 → wptr), skipping the second copy if the first failed.
  - On exit, calls amdgpu_rlc_spm_set_rdptr() to inform the hardware
    of the updated rptr, preventing ring buffer stalls.

amdgpu_spm_work() (work_struct handler, previously a stub):
  Attaches the lead_thread's mm via kthread_use_mm() to permit
  copy_to_user() calls, then under spm_worker_mutex:
  - Resets are_users_buf_filled to false via WRITE_ONCE().
  - Calls amdgpu_spm_read_ring_buffer() for each active XCC.
  - If any XCC's is_user_buf_filled is true, sets are_users_buf_filled
    via WRITE_ONCE() and calls wake_up() on spm_buf_wq to unblock any
    thread waiting in SET_DEST_BUF.
  - Releases spm_worker_mutex before calling wake_up() to avoid
    unnecessary lock contention on the waiter's wakeup path.

Signed-off-by: James Zhu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c | 117 +++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
index d4af195bbcd2..9f0d1f688d5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
@@ -35,16 +35,131 @@
 static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file 
*filp);
 static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst, 
struct drm_file *filp);
 
+static int amdgpu_spm_data_copy(struct amdgpu_spm_mgr *spm_mgr, u32 
size_to_copy, int inst)
+{
+       struct amdgpu_spm_base *spm = &(spm_mgr->spm_cntr->spm[inst]);
+       uint64_t __user *user_address;
+       uint64_t *ring_buf;
+       u32 user_buf_space_left;
+       int ret = 0;
+
+       if (spm->ubuf.user_addr == NULL)
+               return -EFAULT;
+
+       user_address = (uint64_t *)((uint64_t)spm->ubuf.user_addr + 
spm->size_copied);
+       /* From RLC spec, ring_rptr = 0 points to spm->cpu_addr + 0x20 */
+       ring_buf =  (uint64_t *)((uint64_t)spm->cpu_addr + spm->ring_rptr + 
0x20);
+
+       user_buf_space_left = spm->ubuf.ubufsize - spm->size_copied;
+
+       if (size_to_copy < user_buf_space_left) {
+               ret = copy_to_user(user_address, ring_buf, size_to_copy);
+               if (ret) {
+                       spm->has_data_loss = 1;
+                       return -EFAULT;
+               }
+               spm->size_copied += size_to_copy;
+               spm->ring_rptr += size_to_copy;
+       } else {
+               ret = copy_to_user(user_address, ring_buf, user_buf_space_left);
+               if (ret) {
+                       spm->has_data_loss = 1;
+                       return -EFAULT;
+               }
+
+               spm->size_copied = spm->ubuf.ubufsize;
+               spm->ring_rptr += user_buf_space_left;
+               spm->is_user_buf_filled = true;
+       }
+
+       return ret;
+}
+
+static int amdgpu_spm_read_ring_buffer(struct amdgpu_spm_mgr *spm_mgr, int 
inst)
+{
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+       struct amdgpu_spm_base *spm = &(spm_mgr->spm_cntr->spm[inst]);
+       u32 size_to_copy;
+       int ret = 0;
+       u32 ring_wptr;
+
+       ring_wptr = READ_ONCE(spm->cpu_addr[0]);
+       dma_rmb();
+
+       /* SPM might stall if we cannot copy data out of SPM ringbuffer.
+        * spm->has_data_loss is only a hint here since stall is only a
+        * possibility and data loss might not happen. But it is a useful
+        * hint for user mode profiler to take extra actions.
+        */
+       if (!spm->has_user_buf || spm->is_user_buf_filled) {
+               spm->has_data_loss = 1;
+               /* set flag due to there is no flag setup
+                * when read ring buffer timeout.
+                */
+               if (!spm->is_user_buf_filled)
+                       spm->is_user_buf_filled = true;
+               dev_dbg(adev->dev, "[SPM#%d] [%d|%d] rptr:0x%x--wptr:0x%x", 
inst,
+                       spm->has_user_buf, spm->is_user_buf_filled, 
spm->ring_rptr, ring_wptr);
+               goto exit;
+       }
+
+       if (spm->ring_rptr == ring_wptr)
+               goto exit;
+
+       if (ring_wptr > spm->ring_rptr) {
+               size_to_copy = ring_wptr - spm->ring_rptr;
+               ret = amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst);
+       } else {
+               size_to_copy = spm->ring_size - spm->ring_rptr;
+               ret = amdgpu_spm_data_copy(spm_mgr, size_to_copy, inst);
+
+               /* correct counter start point */
+               if (spm->ring_size == spm->ring_rptr) {
+                       if (ring_wptr == 0) {
+                               /* reset rptr to start point of ring buffer */
+                               spm->ring_rptr = ring_wptr;
+                               goto exit;
+                       }
+                       spm->ring_rptr = 0;
+                       size_to_copy = ring_wptr - spm->ring_rptr;
+                       if (!ret)
+                               ret = amdgpu_spm_data_copy(spm_mgr, 
size_to_copy, inst);
+               }
+       }
+
+exit:
+       amdgpu_rlc_spm_set_rdptr(adev, inst, spm->ring_rptr);
+       return ret;
+}
+
 static void amdgpu_spm_work(struct work_struct *work)
 {
        struct amdgpu_spm_mgr *spm_mgr = container_of(work, struct 
amdgpu_spm_mgr, spm_work);
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
        struct mm_struct *mm = NULL;
 
        mm = get_task_mm(spm_mgr->lead_thread);
        if (mm) {
                kthread_use_mm(mm);
                { /* attach mm */
-                       /* TODO: dump spm ring buffer to user buffer */
+                       int inst;
+
+                       mutex_lock(&spm_mgr->spm_cntr->spm_worker_mutex);
+                       WRITE_ONCE(spm_mgr->spm_cntr->are_users_buf_filled, 
false);
+                       for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
+                               struct amdgpu_spm_base *spm = 
&(spm_mgr->spm_cntr->spm[inst]);
+
+                               amdgpu_spm_read_ring_buffer(spm_mgr, inst);
+                               if (spm->is_user_buf_filled)
+                                       
WRITE_ONCE(spm_mgr->spm_cntr->are_users_buf_filled, true);
+                       }
+                       if (READ_ONCE(spm_mgr->spm_cntr->are_users_buf_filled)) 
{
+                               
mutex_unlock(&spm_mgr->spm_cntr->spm_worker_mutex);
+                               pr_debug("SPM wake up buffer work queue.");
+                               wake_up(&spm_mgr->spm_cntr->spm_buf_wq);
+                       } else {
+                               
mutex_unlock(&spm_mgr->spm_cntr->spm_worker_mutex);
+                       }
                } /* detach mm */
                kthread_unuse_mm(mm);
                /* release the mm structure */
-- 
2.34.1

Reply via email to