[PATCH] fixup! drm/amdkfd: CRIU export dmabuf handles for GTT BOs

2022-03-09 Thread David Yat Sin
Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 6 ++
 include/uapi/linux/kfd_ioctl.h   | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e1e2362841f8..607f65ab39ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1767,7 +1767,10 @@ static int criu_checkpoint_bos(struct kfd_process *p,
&bo_bucket->dmabuf_fd);
if (ret)
goto exit;
+   } else {
+   bo_bucket->dmabuf_fd = KFD_INVALID_FD;
}
+
if (bo_bucket->alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
KFD_MMAP_GPU_ID(pdd->dev->id);
@@ -2219,7 +2222,10 @@ static int criu_restore_bo(struct kfd_process *p,
&bo_bucket->dmabuf_fd);
if (ret)
return ret;
+   } else {
+   bo_bucket->dmabuf_fd = KFD_INVALID_FD;
}
+
return 0;
 }
 
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index eb9ff85f8556..42975e940758 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -196,6 +196,8 @@ struct kfd_ioctl_dbg_wave_control_args {
__u32 buf_size_in_bytes;/*including gpu_id and buf_size */
 };
 
+#define KFD_INVALID_FD 0xffffffff
+
 /* Matching HSA_EVENTTYPE */
 #define KFD_IOC_EVENT_SIGNAL   0
 #define KFD_IOC_EVENT_NODECHANGE   1
-- 
2.35.1



[PATCH] drm/amdkfd: Set handle to invalid for non GTT/VRAM BOs

2022-03-09 Thread David Yat Sin
Set dmabuf handle to invalid for BOs that cannot be accessed using SDMA
during checkpoint/restore.

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 8 ++--
 include/uapi/linux/kfd_ioctl.h   | 2 ++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e1e2362841f8..1ffa976ad318 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1767,7 +1767,9 @@ static int criu_checkpoint_bos(struct kfd_process *p,
&bo_bucket->dmabuf_fd);
if (ret)
goto exit;
-   }
+   } else
+   bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+
if (bo_bucket->alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
KFD_MMAP_GPU_ID(pdd->dev->id);
@@ -2219,7 +2221,9 @@ static int criu_restore_bo(struct kfd_process *p,
&bo_bucket->dmabuf_fd);
if (ret)
return ret;
-   }
+   } else
+   bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+
return 0;
 }
 
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index eb9ff85f8556..42975e940758 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -196,6 +196,8 @@ struct kfd_ioctl_dbg_wave_control_args {
__u32 buf_size_in_bytes;/*including gpu_id and buf_size */
 };
 
+#define KFD_INVALID_FD 0xffffffff
+
 /* Matching HSA_EVENTTYPE */
 #define KFD_IOC_EVENT_SIGNAL   0
 #define KFD_IOC_EVENT_NODECHANGE   1
-- 
2.35.1



[PATCH v2] drm/amdkfd: CRIU export dmabuf handles for GTT BOs

2022-03-08 Thread David Yat Sin
Export dmabuf handles for GTT BOs so that their contents can be accessed
using SDMA during checkpoint/restore.

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 12 
 include/uapi/linux/kfd_ioctl.h   |  3 ++-
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2c7d76e67ddb..e1e2362841f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1759,7 +1759,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
goto exit;
}
}
-   if (bo_bucket->alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+   if (bo_bucket->alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
ret = 
criu_get_prime_handle(&dumper_bo->tbo.base,
bo_bucket->alloc_flags &

KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
@@ -1812,7 +1813,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
 
 exit:
while (ret && bo_index--) {
-   if (bo_buckets[bo_index].alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+   if (bo_buckets[bo_index].alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT))
close_fd(bo_buckets[bo_index].dmabuf_fd);
}
 
@@ -2211,7 +2213,8 @@ static int criu_restore_bo(struct kfd_process *p,
 
pr_debug("map memory was successful for the BO\n");
/* create the dmabuf object and export the bo */
-   if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+   if (bo_bucket->alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
ret = criu_get_prime_handle(&kgd_mem->bo->tbo.base, DRM_RDWR,
&bo_bucket->dmabuf_fd);
if (ret)
@@ -2281,7 +2284,8 @@ static int criu_restore_bos(struct kfd_process *p,
 
 exit:
while (ret && i--) {
-   if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+   if (bo_buckets[i].alloc_flags
+  & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT))
close_fd(bo_buckets[i].dmabuf_fd);
}
kvfree(bo_buckets);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b40687bf1014..eb9ff85f8556 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -33,9 +33,10 @@
  * - 1.5 - Add SVM API
  * - 1.6 - Query clear flags in SVM get_attr API
  * - 1.7 - Checkpoint Restore (CRIU) API
+ * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 7
+#define KFD_IOCTL_MINOR_VERSION 8
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
-- 
2.17.1



[PATCH] drm/amdkfd: CRIU export dmabuf handles for GTT BOs

2022-03-08 Thread David Yat Sin
Export dmabuf handles for GTT BOs so that their contents can be accessed
using SDMA during checkpoint/restore.

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2c7d76e67ddb..e1e2362841f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1759,7 +1759,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
goto exit;
}
}
-   if (bo_bucket->alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+   if (bo_bucket->alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
ret = 
criu_get_prime_handle(&dumper_bo->tbo.base,
bo_bucket->alloc_flags &

KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
@@ -1812,7 +1813,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
 
 exit:
while (ret && bo_index--) {
-   if (bo_buckets[bo_index].alloc_flags & 
KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+   if (bo_buckets[bo_index].alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT))
close_fd(bo_buckets[bo_index].dmabuf_fd);
}
 
@@ -2211,7 +2213,8 @@ static int criu_restore_bo(struct kfd_process *p,
 
pr_debug("map memory was successful for the BO\n");
/* create the dmabuf object and export the bo */
-   if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+   if (bo_bucket->alloc_flags
+   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
ret = criu_get_prime_handle(&kgd_mem->bo->tbo.base, DRM_RDWR,
&bo_bucket->dmabuf_fd);
if (ret)
@@ -2281,7 +2284,8 @@ static int criu_restore_bos(struct kfd_process *p,
 
 exit:
while (ret && i--) {
-   if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+   if (bo_buckets[i].alloc_flags
+  & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT))
close_fd(bo_buckets[i].dmabuf_fd);
}
kvfree(bo_buckets);
-- 
2.17.1



[PATCH 1/2] drm/amdkfd: CRIU remove sync and TLB flush on restore

2022-03-08 Thread David Yat Sin
When the process is getting restored, the queues are not mapped yet, so
there is no VMID assigned for this process and no TLBs to flush.

Signed-off-by: David Yat Sin 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 +---
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 59d3fe269e7c..789bdfbd3f9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2102,7 +2102,6 @@ static int criu_restore_bos(struct kfd_process *p,
struct kfd_criu_bo_bucket *bo_buckets = NULL;
struct kfd_criu_bo_priv_data *bo_privs = NULL;
const bool criu_resume = true;
-   bool flush_tlbs = false;
int ret = 0, j = 0;
uint32_t i = 0;
 
@@ -2248,7 +2247,6 @@ static int criu_restore_bos(struct kfd_process *p,
for (j = 0; j < p->n_pdds; j++) {
struct kfd_dev *peer;
struct kfd_process_device *peer_pdd;
-   bool table_freed = false;
 
if (!bo_priv->mapped_gpuids[j])
break;
@@ -2268,20 +2266,11 @@ static int criu_restore_bos(struct kfd_process *p,
pr_debug("map mem in restore ioctl -> 0x%llx\n",
 ((struct kgd_mem *)mem)->va);
ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev,
-   (struct kgd_mem *)mem, peer_pdd->drm_priv, 
&table_freed);
+   (struct kgd_mem *)mem, peer_pdd->drm_priv, 
NULL);
if (ret) {
pr_err("Failed to map to gpu %d/%d\n", j, 
p->n_pdds);
goto exit;
}
-   if (table_freed)
-   flush_tlbs = true;
-   }
-
-   ret = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev,
- (struct kgd_mem *) mem, 
true);
-   if (ret) {
-   pr_debug("Sync memory failed, wait interrupted by user 
signal\n");
-   goto exit;
}
 
pr_debug("map memory was successful for the BO\n");
@@ -2296,23 +2285,6 @@ static int criu_restore_bos(struct kfd_process *p,
}
} /* done */
 
-   if (flush_tlbs) {
-   /* Flush TLBs after waiting for the page table updates to 
complete */
-   for (j = 0; j < p->n_pdds; j++) {
-   struct kfd_dev *peer;
-   struct kfd_process_device *pdd = p->pdds[j];
-   struct kfd_process_device *peer_pdd;
-
-   peer = kfd_device_by_id(pdd->dev->id);
-   if (WARN_ON_ONCE(!peer))
-   continue;
-   peer_pdd = kfd_get_process_device_data(peer, p);
-   if (WARN_ON_ONCE(!peer_pdd))
-   continue;
-   kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
-   }
-   }
-
/* Copy only the buckets back so user can read 
bo_buckets[N].restored_offset */
ret = copy_to_user((void __user *)args->bos,
bo_buckets,
-- 
2.17.1



[PATCH 2/2] drm/amdkfd: CRIU Refactor restore BO function

2022-03-08 Thread David Yat Sin
Refactor CRIU restore BO to reduce indentation before adding support for
IPC.

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 271 +++
 1 file changed, 129 insertions(+), 142 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 789bdfbd3f9b..2c7d76e67ddb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2094,6 +2094,132 @@ static int criu_restore_devices(struct kfd_process *p,
return ret;
 }
 
+static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
+ struct kfd_criu_bo_bucket *bo_bucket,
+ struct kfd_criu_bo_priv_data *bo_priv,
+ struct kgd_mem **kgd_mem)
+{
+   int idr_handle;
+   int ret;
+   const bool criu_resume = true;
+   u64 offset;
+
+   if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+   if (bo_bucket->size != kfd_doorbell_process_slice(pdd->dev))
+   return -EINVAL;
+
+   offset = kfd_get_process_doorbells(pdd);
+   } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) 
{
+   /* MMIO BOs need remapped bus address */
+   if (bo_bucket->size != PAGE_SIZE) {
+   pr_err("Invalid page size\n");
+   return -EINVAL;
+   }
+   offset = pdd->dev->adev->rmmio_remap.bus_addr;
+   if (!offset) {
+   pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr 
failed\n");
+   return -ENOMEM;
+   }
+   } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+   offset = bo_priv->user_addr;
+   }
+   /* Create the BO */
+   ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, 
bo_bucket->addr,
+ bo_bucket->size, 
pdd->drm_priv, kgd_mem,
+ &offset, 
bo_bucket->alloc_flags, criu_resume);
+   if (ret) {
+   pr_err("Could not create the BO\n");
+   return ret;
+   }
+   pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
+bo_bucket->size, bo_bucket->addr, offset);
+
+   /* Restore previous IDR handle */
+   pr_debug("Restoring old IDR handle for the BO");
+   idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
+  bo_priv->idr_handle + 1, GFP_KERNEL);
+
+   if (idr_handle < 0) {
+   pr_err("Could not allocate idr\n");
+   amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, 
*kgd_mem, pdd->drm_priv,
+  NULL);
+   return -ENOMEM;
+   }
+
+   if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+   bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | 
KFD_MMAP_GPU_ID(pdd->dev->id);
+   if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+   bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | 
KFD_MMAP_GPU_ID(pdd->dev->id);
+   } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
+   bo_bucket->restored_offset = offset;
+   } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+   bo_bucket->restored_offset = offset;
+   /* Update the VRAM usage count */
+   WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+   }
+   return 0;
+}
+
+static int criu_restore_bo(struct kfd_process *p,
+  struct kfd_criu_bo_bucket *bo_bucket,
+  struct kfd_criu_bo_priv_data *bo_priv)
+{
+   struct kfd_process_device *pdd;
+   struct kgd_mem *kgd_mem;
+   int ret;
+   int j;
+
+   pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x 
idr_handle:0x%x\n",
+bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, 
bo_bucket->alloc_flags,
+bo_priv->idr_handle);
+
+   pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
+   if (!pdd) {
+   pr_err("Failed to get pdd\n");
+   return -ENODEV;
+   }
+
+   ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
+   if (ret)
+   return ret;
+
+   /* now map these BOs to GPU/s */
+   for (j = 0; j < p->n_pdds; j++) {
+   struct kfd_dev *peer;
+   struct kfd_process_device 

[PATCH] drm/amdkfd: Fix for possible integer overflow

2022-02-18 Thread David Yat Sin
Fix for possible integer overflow when doing addition.

Reported-by: Dan Carpenter 
Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index c3f252fc337b..6eca9509f2e3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -786,7 +786,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
}
 
*priv_data_offset += sizeof(*q_data);
-   q_extra_data_size = q_data->ctl_stack_size + q_data->mqd_size;
+   q_extra_data_size = (uint64_t)q_data->ctl_stack_size + q_data->mqd_size;
 
if (*priv_data_offset + q_extra_data_size > max_priv_data_size) {
ret = -EINVAL;
-- 
2.17.1