The v11 MQD manager incorrectly assigned the CP-compute variants of
checkpoint_mqd/restore_mqd for KFD_MQD_TYPE_SDMA queues. These functions
use sizeof(struct v11_compute_mqd) (2048 bytes) instead of sizeof(struct
v11_sdma_mqd) (512 bytes), causing a 1536-byte overflow.
During CRIU checkpoint of an SDMA queue on Navi3x:
- checkpoint_mqd() reads 2048 bytes from a 512-byte SDMA MQD buffer,
leaking 1536 bytes of adjacent GTT memory to userspace
During CRIU restore:
- restore_mqd() writes 2048 bytes into a 512-byte SDMA MQD buffer,
corrupting 1536 bytes of adjacent GTT memory (often the ring buffer
or neighboring MQDs)
This is a copy-paste regression unique to v11. All other ASIC backends
(cik, vi, v9, v10, v12) correctly use the SDMA-specific variants.
Add checkpoint_mqd_sdma() and restore_mqd_sdma() functions that properly
handle the smaller v11_sdma_mqd structure, matching the pattern used in
other MQD managers.
Fixes: cc009e613de6 ("drm/amdkfd: Add KFD support for soc21 v3")
Assisted-by: Claude:Sonnet 4-5
Signed-off-by: Andrew Martin <[email protected]>
---
.../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 40 ++++++++++++++++++-
1 file changed, 38 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index 4d8cf6008a77..ce0f5e8e5c29 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -355,6 +355,42 @@ static void restore_mqd(struct mqd_manager *mm, void **mqd,
qp->is_active = 0;
}
+static void checkpoint_mqd_sdma(struct mqd_manager *mm,
+ void *mqd,
+ void *mqd_dst,
+ void *ctl_stack_dst)
+{
+ struct v11_sdma_mqd *m;
+
+ m = get_sdma_mqd(mqd);
+ /* Save exactly one struct v11_sdma_mqd to @mqd_dst; no control stack is written. */
+ memcpy(mqd_dst, m, sizeof(struct v11_sdma_mqd));
+}
+
+static void restore_mqd_sdma(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *qp,
+ const void *mqd_src,
+ const void *ctl_stack_src,
+ const u32 ctl_stack_size)
+{
+ uint64_t addr;
+ struct v11_sdma_mqd *m;
+
+ m = (struct v11_sdma_mqd *) mqd_mem_obj->cpu_ptr;
+ addr = mqd_mem_obj->gpu_addr;
+ /* Restore from @mqd_src; sizeof(*m) bounds the copy to one v11_sdma_mqd. */
+ memcpy(m, mqd_src, sizeof(*m));
+ /* The doorbell offset comes from @qp, not from the copied image. */
+ m->sdmax_rlcx_doorbell_offset =
+ qp->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
+ /* Return the MQD, and its GPU address when @gart_addr is non-NULL. */
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+ /* The queue is left marked inactive after restore. */
+ qp->is_active = 0;
+}
static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
@@ -539,8 +575,8 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = kfd_destroy_mqd_sdma;
mqd->is_occupied = kfd_is_occupied_sdma;
- mqd->checkpoint_mqd = checkpoint_mqd;
- mqd->restore_mqd = restore_mqd;
+ mqd->checkpoint_mqd = checkpoint_mqd_sdma;
+ mqd->restore_mqd = restore_mqd_sdma;
mqd->mqd_size = sizeof(struct v11_sdma_mqd);
mqd->mqd_stride = kfd_mqd_stride;
#if defined(CONFIG_DEBUG_FS)
--
2.43.0