to implement AMDGPU_SPM_OP_ACQUIRE, which grants a process exclusive
access to the SPM hardware on its XCP partition and allocates the
per-XCC ring buffers used for streaming performance counter data.

Data structures:
  struct amdgpu_spm_base (amdgpu_spm.h):
    Holds the per-XCC ring buffer state: GTT BO handle (spm_obj),
    GPU virtual address (gpu_addr), kernel virtual address (cpu_addr),
    and effective ring size (ring_size).

  struct amdgpu_spm_cntr (amdgpu_spm.h):
    Contains an array of amdgpu_spm_base[MAX_XCP] (one per XCC) and a
    spm_worker_mutex to serialize worker operations. Allocated on first
    ACQUIRE and freed on RELEASE.

  struct amdgpu_spm_mgr (amdgpu_spm.h):
    Extended with lead_thread (the acquiring process's thread group
    leader), spm_cntr pointer, and spm_work work_struct for the deferred
    ring buffer drain worker.

ACQUIRE flow (amdgpu_spm_acquire):
  1. Takes the per-XCP prof_xcp_mgr->mutex to serialize concurrent
     ACQUIRE attempts.
  2. Returns -EBUSY if spm_cntr is already allocated (another process
     holds SPM).
  3. Allocates spm_cntr via kzalloc and records current->group_leader
     as lead_thread.
  4. For each XCC in AMDGPU_XCC_MASK(adev), calls _amdgpu_spm_acquire():
     - Allocates a 4 MiB GTT ring buffer via amdgpu_bo_alloc_gtt_mem().
     - Programs the ring buffer into the RLC hardware via
       amdgpu_rlc_spm_acquire(), which also reserves a VMID for the
       caller's VM.
     - Subtracts 0x20 bytes (8 DWORDs) from ring_size to exclude the
       hardware-defined metadata area at the end of the buffer.
     - On failure, frees the GTT BO and clears the spm_base entry.
  5. Initializes spm_worker_mutex and INIT_WORK for amdgpu_spm_work.
  6. Records the drm_file pointer in spm_mgr->file.
  7. On any per-XCC failure, rolls back all already-acquired XCCs via
     _amdgpu_spm_release() (stub, TODO) and frees spm_cntr.

amdgpu_spm_work (work_struct handler):
  Obtains the lead_thread's mm_struct via get_task_mm() and attaches it
  with kthread_use_mm() to enable user-space copy operations, then
  detaches it with kthread_unuse_mm() and drops the reference with
  mmput(). The actual ring buffer drain to user space is a TODO for a
  later patch.

Three new navigation macros are added in amdgpu_profiler.h:
  - to_prof_xcp_mgr(x, y): container_of mapping a pointer x to member y
    back to its enclosing struct amdgpu_profiler_xcp_mgr.
  - xcp_to_prof_mgr(x, y): container_of from prof_xcp_mgr[] element to
    amdgpu_profiler_mgr.
  - mgr_to_adev(x, y): GNU statement expression chaining the two above
    to reach the owning amdgpu_device from an embedded manager pointer.

AMDGPU_XCC_MASK(adev) is added using GENMASK() to safely produce a
bitmask of active XCC instances from NUM_XCC(adev->gfx.xcc_mask).

In amdgpu_spm_ioctl(), the per-XCP spm_mgr is now resolved from fpriv
using AMDGPU_XCP_ID() before dispatching to sub-operation handlers.

Signed-off-by: James Zhu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_profiler.h |  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c      | 114 ++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h      |  17 +++
 3 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_profiler.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_profiler.h
index ea62a4dee364..587adadaedb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_profiler.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_profiler.h
@@ -27,6 +27,7 @@
 
 #include "amdgpu_spm.h"
 
+#define AMDGPU_XCC_MASK(adev) GENMASK(NUM_XCC(adev->gfx.xcc_mask) - 1, 0)
 #define AMDGPU_XCP_ID(x) (x == AMDGPU_XCP_NO_PARTITION ? 0 : x)
 #define fpriv_to_prof_mgr(fpriv) (&(fpriv)->userq_mgr.adev->prof_mgr)
 #define fpriv_to_adev(fpriv) ((fpriv)->userq_mgr.adev)
@@ -34,6 +35,18 @@
 #define prof_mgr_to_adev(x) \
        container_of(x, struct amdgpu_device, prof_mgr)
 
+#define to_prof_xcp_mgr(x, y) \
+       container_of(x, struct amdgpu_profiler_xcp_mgr, y)
+
+#define xcp_to_prof_mgr(x, y) \
+       container_of(x, struct amdgpu_profiler_mgr, y)
+
+#define mgr_to_adev(x, y) \
+({     struct amdgpu_profiler_xcp_mgr *prof_xcp_mgr = to_prof_xcp_mgr(x, y); \
+       struct amdgpu_profiler_mgr *prof_mgr = \
+               xcp_to_prof_mgr(prof_xcp_mgr, 
prof_xcp_mgr[prof_xcp_mgr->xcp_id]);\
+       prof_mgr_to_adev(prof_mgr); })
+
 struct amdgpu_profiler_xcp_mgr {
        struct mutex                   mutex;
        uint32_t                       xcp_id;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
index 6ff88dfabf1c..e58b89ccd83f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.c
@@ -30,10 +30,28 @@
  */
 
 static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file 
*filp);
+static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst, 
struct drm_file *filp);
+
+static void amdgpu_spm_work(struct work_struct *work)
+{
+       struct amdgpu_spm_mgr *spm_mgr = container_of(work, struct 
amdgpu_spm_mgr, spm_work);
+       struct mm_struct *mm = NULL;
+
+       mm = get_task_mm(spm_mgr->lead_thread);
+       if (mm) {
+               kthread_use_mm(mm);
+               { /* attach mm */
+                       /* TODO: dump spm ring buffer to user buffer */
+               } /* detach mm */
+               kthread_unuse_mm(mm);
+               /* release the mm structure */
+               mmput(mm);
+       }
+}
 
 static void amdgpu_spm_init_device(struct amdgpu_spm_mgr *spm_mgr)
 {
-       /* TODO */
+       spm_mgr->spm_cntr = NULL;
 }
 
 static void amdgpu_spm_release_device(struct amdgpu_spm_mgr *spm_mgr, struct 
drm_file *filp)
@@ -41,6 +59,94 @@ static void amdgpu_spm_release_device(struct amdgpu_spm_mgr 
*spm_mgr, struct drm
        amdgpu_spm_release(spm_mgr, filp);
 }
 
+static int _amdgpu_spm_acquire(struct amdgpu_spm_mgr *spm_mgr, int inst, 
struct drm_file *filp)
+{
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+       struct amdgpu_spm_base *spm = &(spm_mgr->spm_cntr->spm[inst]);
+       int ret = 0;
+
+       /* allocate 4M spm ring buffer */
+       spm->ring_size = 4 * 1024 * 1024;
+
+       ret = amdgpu_bo_alloc_gtt_mem(adev,
+                       spm->ring_size, &spm->spm_obj,
+                       &spm->gpu_addr, (void *)&spm->cpu_addr,
+                       false, false);
+
+       if (ret)
+               goto out;
+
+       ret = amdgpu_rlc_spm_acquire(adev, inst, drm_priv_to_vm(filp),
+                       spm->gpu_addr, spm->ring_size);
+       if (ret)
+               goto rlc_spm_acquire_failure;
+
+       /*
+        * By definition, the last 8 DWs of the buffer are not part of the rings
+        *  and are instead part of the Meta data area.
+        */
+       spm->ring_size -= 0x20;
+
+       goto out;
+
+rlc_spm_acquire_failure:
+       amdgpu_bo_free_gtt_mem(adev, &spm->spm_obj);
+       memset(spm, 0, sizeof(*spm));
+out:
+       return ret;
+}
+
+static int amdgpu_spm_acquire(struct amdgpu_spm_mgr *spm_mgr, struct drm_file 
*filp)
+{
+       struct amdgpu_device *adev = mgr_to_adev(spm_mgr, spm_mgr);
+       int ret = 0;
+       int inst;
+
+       mutex_lock(&(to_prof_xcp_mgr(spm_mgr, spm_mgr)->mutex));
+
+       if (spm_mgr->spm_cntr) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       spm_mgr->spm_cntr = kzalloc(sizeof(struct amdgpu_spm_cntr), GFP_KERNEL);
+       if (!spm_mgr->spm_cntr) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       spm_mgr->lead_thread = current->group_leader;
+
+       for_each_inst(inst, AMDGPU_XCC_MASK(adev)) {
+               ret = _amdgpu_spm_acquire(spm_mgr, inst, filp);
+               if (ret)
+                       goto acquire_spm_failure;
+       }
+
+       mutex_init(&spm_mgr->spm_cntr->spm_worker_mutex);
+
+       INIT_WORK(&spm_mgr->spm_work, amdgpu_spm_work);
+
+       spm_mgr->file = filp;
+
+       goto out;
+
+acquire_spm_failure:
+       for_each_inst(inst, AMDGPU_XCC_MASK(adev))
+               _amdgpu_spm_release(spm_mgr, inst, filp);
+       kfree(spm_mgr->spm_cntr);
+       spm_mgr->spm_cntr = NULL;
+
+out:
+       mutex_unlock(&(to_prof_xcp_mgr(spm_mgr, spm_mgr)->mutex));
+       return ret;
+}
+
+static void _amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, int inst, 
struct drm_file *filp)
+{
+       /* TODO: */
+
+}
+
 static int amdgpu_spm_release(struct amdgpu_spm_mgr *spm_mgr, struct drm_file 
*filp)
 {
        /* TODO */
@@ -52,9 +158,15 @@ int amdgpu_spm_ioctl(struct drm_device *dev, void *data,
 {
        struct amdgpu_fpriv *fpriv = filp->driver_priv;
        struct amdgpu_device *adev = fpriv_to_adev(fpriv);
+       struct amdgpu_profiler_mgr *prof_mgr = fpriv_to_prof_mgr(fpriv);
        struct drm_amdgpu_spm_args *args = data;
+       struct amdgpu_spm_mgr *spm_mgr =
+               &(prof_mgr->prof_xcp_mgr[AMDGPU_XCP_ID(fpriv->xcp_id)].spm_mgr);
 
        switch (args->op) {
+       case AMDGPU_SPM_OP_ACQUIRE:
+               return amdgpu_spm_acquire(spm_mgr, filp);
+
        default:
                dev_dbg(adev->dev, "Invalid option: %i\n", args->op);
                return -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
index dc55d2a8f016..9db89fd6154d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_spm.h
@@ -24,8 +24,25 @@
 #ifndef AMDGPU_SPM_H_
 #define AMDGPU_SPM_H_
 
+struct amdgpu_spm_base {
+       u64    gpu_addr;
+       u32    ring_size;
+       u32    *cpu_addr;
+       void   *spm_obj;
+};
+struct amdgpu_spm_cntr {
+       struct amdgpu_spm_base spm[MAX_XCP];
+       struct mutex spm_worker_mutex;
+};
+
 struct amdgpu_spm_mgr {
        struct drm_file *file;
+
+       struct task_struct *lead_thread;
+
+       /* spm data */
+       struct amdgpu_spm_cntr *spm_cntr;
+       struct work_struct spm_work;
 };
 
 int amdgpu_spm_ioctl(struct drm_device *dev, void *data,
-- 
2.34.1

Reply via email to