From: David Yat Sin <david.yat...@amd.com>

Add support to the existing CRIU ioctls to save the number of queues and the
properties of each queue during checkpoint, and to re-create the queues on restore.

Signed-off-by: David Yat Sin <david.yat...@amd.com>
Change-Id: Ifcd5e8359f492eef015867f354f44146dd1b6848
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 234 ++++++++++++++++++++++-
 1 file changed, 231 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 788baee2a025..a9a04148e94c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1840,7 +1840,6 @@ static int kfd_devinfo_restore(struct kfd_process *p, 
struct kfd_criu_devinfo_bu
                               uint32_t num_of_devices)
 {
        int i;
-
        if (p->n_pdds != num_of_devices)
                return -EINVAL;
 
@@ -1891,6 +1890,77 @@ static int kfd_devinfo_restore(struct kfd_process *p, 
struct kfd_criu_devinfo_bu
        }
        return 0;
 }
+/* Fill a CRIU checkpoint bucket from a live queue's current state. */
+static void criu_dump_queue(struct kfd_process_device *pdd,
+                           struct queue *q,
+                           struct kfd_criu_q_bucket *q_bucket)
+{
+       struct queue_properties *props = &q->properties;
+
+       /* Device and queue identity */
+       q_bucket->gpu_id = pdd->dev->id;
+       q_bucket->q_id = props->queue_id;
+       q_bucket->type = props->type;
+       q_bucket->format = props->format;
+
+       /* Ring buffer location and scheduling parameters */
+       q_bucket->q_address = props->queue_address;
+       q_bucket->q_size = props->queue_size;
+       q_bucket->priority = props->priority;
+       q_bucket->q_percent = props->queue_percent;
+       q_bucket->read_ptr_addr = (uint64_t)props->read_ptr;
+       q_bucket->write_ptr_addr = (uint64_t)props->write_ptr;
+
+       /* Doorbell and SDMA engine assignment */
+       q_bucket->doorbell_id = q->doorbell_id;
+       q_bucket->doorbell_off = props->doorbell_off;
+       q_bucket->sdma_id = q->sdma_id;
+
+       /* EOP buffer and context save/restore area */
+       q_bucket->eop_ring_buffer_address = props->eop_ring_buffer_address;
+       q_bucket->eop_ring_buffer_size = props->eop_ring_buffer_size;
+       q_bucket->ctx_save_restore_area_address =
+               props->ctx_save_restore_area_address;
+       q_bucket->ctx_save_restore_area_size =
+               props->ctx_save_restore_area_size;
+       q_bucket->ctl_stack_size = props->ctl_stack_size;
+}
+
+/*
+ * Walk every user queue on @pdd and copy its properties into the
+ * user-supplied bucket array, advancing *@q_index per queue dumped.
+ * Returns 0 on success or a negative errno.
+ */
+static int criu_dump_queues_device(struct kfd_process_device *pdd,
+                               unsigned *q_index,
+                               unsigned int max_num_queues,
+                               struct kfd_criu_q_bucket *user_buckets)
+{
+       struct queue *q;
+       struct kfd_criu_q_bucket q_bucket;
+       int ret = 0;
+
+       list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+               if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE &&
+                       q->properties.type != KFD_QUEUE_TYPE_SDMA &&
+                       q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI) {
+
+                       pr_err("Unsupported queue type (%d)\n", q->properties.type);
+                       /* ENOTSUPP is not a valid userspace errno; use EOPNOTSUPP */
+                       return -EOPNOTSUPP;
+               }
+
+               /* Guard against the queue count from the helper ioctl going stale */
+               if (*q_index >= max_num_queues) {
+                       pr_err("Number of queues(%d) exceed allocated(%d)\n",
+                               *q_index, max_num_queues);
+
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               memset(&q_bucket, 0, sizeof(q_bucket));
+               criu_dump_queue(pdd, q, &q_bucket);
+
+               /* copy_to_user returns the number of bytes NOT copied */
+               if (copy_to_user((void __user *)&user_buckets[*q_index],
+                                       &q_bucket, sizeof(q_bucket))) {
+                       pr_err("Failed to copy queue information to user\n");
+                       ret = -EFAULT;
+                       break;
+               }
+               *q_index = *q_index + 1;
+       }
+       return ret;
+}
 
 static int kfd_ioctl_criu_dumper(struct file *filep,
                                struct kfd_process *p, void *data)
@@ -1900,8 +1970,13 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
        struct amdgpu_bo *dumper_bo;
        int ret, id, index, i = 0;
        struct kgd_mem *kgd_mem;
+       int q_index = 0;
        void *mem;
 
+       struct kfd_criu_q_bucket *user_buckets =
+               (struct kfd_criu_q_bucket*) args->kfd_criu_q_buckets_ptr;
+
+
        pr_info("Inside %s\n",__func__);
 
        if (args->num_of_bos == 0) {
@@ -1922,6 +1997,8 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
        if (!bo_bucket)
                return -ENOMEM;
 
+       pr_debug("num of queues = %u\n", args->num_of_queues);
+
        mutex_lock(&p->mutex);
 
        if (!kfd_has_process_device_data(p)) {
@@ -1930,9 +2007,17 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
                goto err_unlock;
        }
 
+       ret = kfd_process_evict_queues(p);
+       if (ret) {
+               pr_err("Failed to evict queues\n");
+               goto err_unlock;
+       }
+
        ret = kfd_devinfo_dump(p, args);
-       if (ret)
+       if (ret) {
+               pr_err("Failed to dump devices\n");
                goto err_unlock;
+       }
 
        /* Run over all PDDs of the process */
        for (index = 0; index < p->n_pdds; index++) {
@@ -1989,6 +2074,11 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
                                i++;
                        }
                }
+
+               ret = criu_dump_queues_device(pdd, &q_index,
+                                       args->num_of_queues, user_buckets);
+               if (ret)
+                       goto err_unlock;
        }
 
        ret = copy_to_user((void __user *)args->kfd_criu_bo_buckets_ptr,
@@ -1996,15 +2086,131 @@ static int kfd_ioctl_criu_dumper(struct file *filep,
                        (args->num_of_bos *
                         sizeof(struct kfd_criu_bo_buckets)));
        kvfree(bo_bucket);
+
+       kfd_process_restore_queues(p);
        mutex_unlock(&p->mutex);
-       return ret ? -EFAULT : 0;
+       return 0;
 
 err_unlock:
+       kfd_process_restore_queues(p);
        mutex_unlock(&p->mutex);
        pr_err("Dumper ioctl failed err:%d\n", ret);
        return ret;
 }
 
+/* Translate a checkpointed queue bucket back into queue_properties. */
+static void set_queue_properties_from_criu(struct queue_properties *qp,
+                                       struct kfd_criu_q_bucket *q_bucket)
+{
+       qp->is_interop = false;
+       qp->is_gws = q_bucket->is_gws;
+       qp->queue_percent = q_bucket->q_percent;
+       qp->priority = q_bucket->priority;
+       qp->queue_address = q_bucket->q_address;
+       qp->queue_size = q_bucket->q_size;
+       /* read/write pointers were checkpointed as u64 addresses */
+       qp->read_ptr = (uint32_t *) q_bucket->read_ptr_addr;
+       qp->write_ptr = (uint32_t *) q_bucket->write_ptr_addr;
+       qp->eop_ring_buffer_address = q_bucket->eop_ring_buffer_address;
+       qp->eop_ring_buffer_size = q_bucket->eop_ring_buffer_size;
+       qp->ctx_save_restore_area_address =
+               q_bucket->ctx_save_restore_area_address;
+       qp->ctx_save_restore_area_size = q_bucket->ctx_save_restore_area_size;
+       qp->ctl_stack_size = q_bucket->ctl_stack_size;
+       qp->type = q_bucket->type;
+       qp->format = q_bucket->format;
+}
+
+/*
+ * Re-create one user queue from its checkpointed properties.
+ * Runs with the process mutex locked.
+ * Returns 0 on success or the negative errno from pqm_create_queue.
+ */
+static int criu_restore_queue(struct kfd_process *p,
+                                       struct kfd_dev *dev,
+                                       struct kfd_process_device *pdd,
+                                       struct kfd_criu_q_bucket *q_bucket)
+{
+       int ret = 0;
+       unsigned int queue_id;
+       struct queue_properties qp;
+
+       pr_debug("Restoring Queue: gpu_id:%x type:%x format:%x queue_id:%u "
+                       "address:%llx size:%llx priority:%u percent:%u "
+                       "read_ptr:%llx write_ptr:%llx doorbell_id:%x "
+                       "doorbell_off:%llx queue_address:%llx\n",
+                       q_bucket->gpu_id,
+                       q_bucket->type,
+                       q_bucket->format,
+                       q_bucket->q_id,
+                       q_bucket->q_address,
+                       q_bucket->q_size,
+                       q_bucket->priority,
+                       q_bucket->q_percent,
+                       q_bucket->read_ptr_addr,
+                       q_bucket->write_ptr_addr,
+                       q_bucket->doorbell_id,
+                       q_bucket->doorbell_off,
+                       q_bucket->q_address);
+
+       memset(&qp, 0, sizeof(qp));
+       set_queue_properties_from_criu(&qp, q_bucket);
+       print_queue_properties(&qp);
+
+       ret = pqm_create_queue(&p->pqm, dev, NULL, &qp, &queue_id, NULL);
+       if (ret) {
+               pr_err("Failed to create new queue err:%d\n", ret);
+               /* Propagate the real error rather than squashing to -EINVAL */
+               return ret;
+       }
+       pr_debug("Queue id %d was restored successfully\n", queue_id);
+
+       return 0;
+}
+
+/*
+ * Re-create all checkpointed queues for process @p.
+ * Runs with the process mutex locked.
+ * Returns 0 on success or a negative errno.
+ */
+static int criu_restore_queues(struct kfd_process *p,
+                       struct kfd_ioctl_criu_restorer_args *args)
+{
+       struct kfd_process_device *pdd;
+       struct kfd_dev *dev;
+       int i;
+       int ret = 0;    /* must be initialized: loop may run zero times */
+       struct kfd_criu_q_bucket __user *user_buckets =
+               (struct kfd_criu_q_bucket __user *)args->kfd_criu_q_buckets_ptr;
+
+       /*
+        * This process will not have any queues at this point, but we are
+        * setting all the dqm's for this process to evicted state.
+        */
+       ret = kfd_process_evict_queues(p);
+       if (ret) {
+               pr_err("Failed to evict queues\n");
+               return ret;
+       }
+
+       for (i = 0; i < args->num_of_queues; i++) {
+               struct kfd_criu_q_bucket q_bucket;
+
+               /* copy_from_user returns the number of bytes NOT copied */
+               if (copy_from_user(&q_bucket, &user_buckets[i],
+                               sizeof(struct kfd_criu_q_bucket))) {
+                       pr_err("Failed to copy queue bucket from user\n");
+                       return -EFAULT;
+               }
+
+               dev = kfd_device_by_id(q_bucket.gpu_id);
+               if (!dev) {
+                       pr_err("Could not get kfd_dev from gpu_id = 0x%x\n",
+                               q_bucket.gpu_id);
+                       return -EINVAL;
+               }
+
+               pdd = kfd_get_process_device_data(dev, p);
+               if (!pdd) {
+                       pr_err("Failed to get pdd\n");
+                       return -EFAULT;
+               }
+
+               ret = criu_restore_queue(p, dev, pdd, &q_bucket);
+               if (ret) {
+                       pr_err("Failed to restore queue (%d)\n", ret);
+                       break;
+               }
+       }
+       return ret;
+}
+
 static int kfd_ioctl_criu_restorer(struct file *filep,
                                struct kfd_process *p, void *data)
 {
@@ -2229,6 +2435,12 @@ static int kfd_ioctl_criu_restorer(struct file *filep,
                kfd_flush_tlb(peer_pdd);
        }
 
+       ret = criu_restore_queues(p, args);
+       if (ret) {
+               err = ret;
+               goto err_unlock;
+       }
+
        ret = copy_to_user((void __user *)args->restored_bo_array_ptr,
                           restored_bo_offsets_arr,
                           (args->num_of_bos * 
sizeof(*restored_bo_offsets_arr)));
@@ -2286,8 +2498,10 @@ static int kfd_ioctl_criu_helper(struct file *filep,
 {
        struct kfd_ioctl_criu_helper_args *args = data;
        struct kgd_mem *kgd_mem;
+       struct queue *q;
        u64 num_of_bos = 0;
        int id, i = 0;
+       u32 q_index = 0;
        void *mem;
        int ret = 0;
 
@@ -2314,12 +2528,26 @@ static int kfd_ioctl_criu_helper(struct file *filep,
                        if ((uint64_t)kgd_mem->va > pdd->gpuvm_base)
                                num_of_bos++;
                }
+
+               list_for_each_entry(q, &pdd->qpd.queues_list, list) {
+                       if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+                               q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+                               q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) 
{
+
+                               q_index++;
+                       } else {
+                               pr_err("Unsupported queue type (%d)\n", 
q->properties.type);
+                               ret = -ENOTSUPP;
+                               goto err_unlock;
+                       }
+               }
        }
 
        args->task_pid = task_pid_nr_ns(p->lead_thread,
                                        task_active_pid_ns(p->lead_thread));
        args->num_of_devices = p->n_pdds;
        args->num_of_bos = num_of_bos;
+       args->num_of_queues = q_index;
        dev_dbg(kfd_device, "Num of bos = %llu\n", num_of_bos);
 
 err_unlock:
-- 
2.17.1

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to