If the GPU hangs, the driver will fail to flush the TLB. Return the
error to the callers so that they have a chance to handle it.

Signed-off-by: Dennis Li <[email protected]>
Change-Id: Ie305ad0a77675f6eab7d5b8f68e279b7f4e7a8b9

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e9b96ad3d9a5..18e243183b5e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1488,7 +1488,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
                peer_pdd = kfd_get_process_device_data(peer, p);
                if (WARN_ON_ONCE(!peer_pdd))
                        continue;
-               kfd_flush_tlb(peer_pdd);
+               err = kfd_flush_tlb(peer_pdd);
        }
 
        kfree(devices_arr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 21eb0998c4ae..d636cbf7d32f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -263,6 +263,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
                        struct queue *q)
 {
        int allocated_vmid = -1, i;
+       int ret = 0;
 
        for (i = dqm->dev->vm_info.first_vmid_kfd;
                        i <= dqm->dev->vm_info.last_vmid_kfd; i++) {
@@ -295,13 +296,26 @@ static int allocate_vmid(struct device_queue_manager *dqm,
                        qpd->vmid,
                        qpd->page_table_base);
        /* invalidate the VM context after pasid and vmid mapping is set up */
-       kfd_flush_tlb(qpd_to_pdd(qpd));
+       ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+       if (ret) {
+               pr_err("Failed to flush tlb\n");
+               goto pro_failed;
+       }
 
        if (dqm->dev->kfd2kgd->set_scratch_backing_va)
                dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
                                qpd->sh_hidden_private_base, qpd->vmid);
 
        return 0;
+
+pro_failed:
+       /* Release the vmid mapping */
+       set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+       dqm->vmid_pasid[qpd->vmid] = 0;
+
+       qpd->vmid = 0;
+       q->properties.vmid = 0;
+       return ret;
 }
 
 static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
@@ -326,12 +340,17 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
                                struct qcm_process_device *qpd,
                                struct queue *q)
 {
+       int ret = 0;
+
        /* On GFX v7, CP doesn't flush TC at dequeue */
        if (q->device->device_info->asic_family == CHIP_HAWAII)
                if (flush_texture_cache_nocpsch(q->device, qpd))
                        pr_err("Failed to flush TC\n");
 
-       kfd_flush_tlb(qpd_to_pdd(qpd));
+       ret = kfd_flush_tlb(qpd_to_pdd(qpd));
+       if (ret) {
+               pr_err("Failed to flush tlb\n");
+       }
 
        /* Release the vmid mapping */
        set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
@@ -795,7 +814,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
                                dqm->dev->kgd,
                                qpd->vmid,
                                qpd->page_table_base);
-               kfd_flush_tlb(pdd);
+               ret = kfd_flush_tlb(pdd);
+               if (ret)
+                       goto out;
        }
 
        /* Take a safe reference to the mm_struct, which may otherwise
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 51ba2020732e..31ea72946d06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1081,7 +1081,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 
 void kfd_signal_reset_event(struct kfd_dev *dev);
 
-void kfd_flush_tlb(struct kfd_process_device *pdd);
+int kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 8616a204e4c3..3919cc88813c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1444,21 +1444,24 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
                               KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
 }
 
-void kfd_flush_tlb(struct kfd_process_device *pdd)
+int kfd_flush_tlb(struct kfd_process_device *pdd)
 {
        struct kfd_dev *dev = pdd->dev;
+       int ret = 0;
 
        if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
                /* Nothing to flush until a VMID is assigned, which
                 * only happens when the first queue is created.
                 */
                if (pdd->qpd.vmid)
-                       amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
+                       ret = amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->kgd,
                                                        pdd->qpd.vmid);
        } else {
-               amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
+               ret = amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->kgd,
                                                pdd->process->pasid);
        }
+
+       return ret;
 }
 
 #if defined(CONFIG_DEBUG_FS)
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to