The v11 MQD manager incorrectly assigned the CP-compute variants of
checkpoint_mqd/restore_mqd for KFD_MQD_TYPE_SDMA queues. These functions
use sizeof(struct v11_compute_mqd) (2048 bytes) instead of sizeof(struct
v11_sdma_mqd) (512 bytes), causing a 1536-byte overflow.

During CRIU checkpoint of an SDMA queue on Navi3x:
- checkpoint_mqd() reads 2048 bytes from a 512-byte SDMA MQD buffer,
  leaking 1536 bytes of adjacent GTT memory to userspace

During CRIU restore:
- restore_mqd() writes 2048 bytes into a 512-byte SDMA MQD buffer,
  corrupting 1536 bytes of adjacent GTT memory (often the ring buffer
  or neighboring MQDs)

This is a copy-paste regression unique to v11. All other ASIC backends
(cik, vi, v9, v10, v12) correctly use the SDMA-specific variants.

Add checkpoint_mqd_sdma() and restore_mqd_sdma() functions that properly
handle the smaller v11_sdma_mqd structure, matching the pattern used in
other MQD managers.

Fixes: cc009e613de6 ("drm/amdkfd: Add KFD support for soc21 v3")
Assisted-by: Claude:Sonnet 4-5
Signed-off-by: Andrew Martin <[email protected]>
---
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  | 40 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 4d8cf6008a77..ce0f5e8e5c29 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -355,6 +355,42 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd,
        qp->is_active = 0;
 }
 
+static void checkpoint_mqd_sdma(struct mqd_manager *mm,
+                               void *mqd,
+                               void *mqd_dst,
+                               void *ctl_stack_dst)
+{
+       struct v11_sdma_mqd *m;
+
+       m = get_sdma_mqd(mqd);
+
+       memcpy(mqd_dst, m, sizeof(struct v11_sdma_mqd));
+}
+
+static void restore_mqd_sdma(struct mqd_manager *mm, void **mqd,
+                            struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+                            struct queue_properties *qp,
+                            const void *mqd_src,
+                            const void *ctl_stack_src,
+                            const u32 ctl_stack_size)
+{
+       uint64_t addr;
+       struct v11_sdma_mqd *m;
+
+       m = (struct v11_sdma_mqd *) mqd_mem_obj->cpu_ptr;
+       addr = mqd_mem_obj->gpu_addr;
+
+       memcpy(m, mqd_src, sizeof(*m));
+
+       m->sdmax_rlcx_doorbell_offset =
+               qp->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
+
+       *mqd = m;
+       if (gart_addr)
+               *gart_addr = addr;
+
+       qp->is_active = 0;
+}
 
 static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
                        struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
@@ -539,8 +575,8 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
                mqd->update_mqd = update_mqd_sdma;
                mqd->destroy_mqd = kfd_destroy_mqd_sdma;
                mqd->is_occupied = kfd_is_occupied_sdma;
-               mqd->checkpoint_mqd = checkpoint_mqd;
-               mqd->restore_mqd = restore_mqd;
+               mqd->checkpoint_mqd = checkpoint_mqd_sdma;
+               mqd->restore_mqd = restore_mqd_sdma;
                mqd->mqd_size = sizeof(struct v11_sdma_mqd);
                mqd->mqd_stride = kfd_mqd_stride;
 #if defined(CONFIG_DEBUG_FS)
-- 
2.43.0

Reply via email to