There's no reason for panthor_vm_[un]map_pages() to fail unless the
drm_gpuvm state and the page table are out of sync, so let's reflect that
by making panthor_vm_unmap_pages() a void function and adding
WARN_ON()s in various places. We also try to recover from such
unexpected mismatches by checking for already unmapped ranges and
skipping them. But there's only so much we can do to cope with such
SW bugs, so when we see a mismatch, we flag the VM unusable and disable
the AS to avoid further GPU accesses to the memory.
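
In pseudo-code, the recovery path added to panthor_vm_unmap_pages() boils
down to the following (a simplified recap of the hunk below, with the
locking, the WARN_ON()s and the map-side handling omitted):

    unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
    if (unmapped_sz != pgsize * pgcount) {
            /* Skip pages that are already unmapped so we keep making progress. */
            while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
                   unmapped_sz < pgsize * pgcount)
                    unmapped_sz += SZ_4K;

            /* The mapping state is corrupted: flag the VM unusable. */
            panthor_vm_declare_unusable(vm);

            /* No progress at all: nothing more we can do here. */
            if (!unmapped_sz)
                    return;
    }
    offset += unmapped_sz;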

It could be that the as_disable() call fails because the MMU unit is
stuck, in which case the whole GPU is frozen, and only a GPU reset can
unblock things. After the reset, the VM will be seen as unusable and
any attempt to re-use it will fail, so we should be covered for any
use-after-unmap issues.

v2:
- Fix double unlock

Signed-off-by: Boris Brezillon <[email protected]>
---
 drivers/gpu/drm/panthor/panthor_mmu.c | 82 ++++++++++++++++++---------
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 21389137a324..35aad1e0ecaa 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -842,12 +842,32 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
        return SZ_2M;
 }
 
-static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
+static void panthor_vm_declare_unusable(struct panthor_vm *vm)
+{
+       struct panthor_device *ptdev = vm->ptdev;
+       int cookie;
+
+       if (vm->unusable)
+               return;
+
+       vm->unusable = true;
+       mutex_lock(&ptdev->mmu->as.slots_lock);
+       if (vm->as.id >= 0 && drm_dev_enter(&ptdev->base, &cookie)) {
+               panthor_mmu_as_disable(ptdev, vm->as.id);
+               drm_dev_exit(cookie);
+       }
+       mutex_unlock(&ptdev->mmu->as.slots_lock);
+}
+
+static void panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
 {
        struct panthor_device *ptdev = vm->ptdev;
        struct io_pgtable_ops *ops = vm->pgtbl_ops;
        u64 offset = 0;
 
+       if (!size)
+               return;
+
        drm_WARN_ON(&ptdev->base,
                    (iova < vm->locked_region.start) ||
                    (iova + size > vm->locked_region.start + vm->locked_region.size));
@@ -858,18 +878,32 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
                size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
 
                unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
+                       if (drm_WARN_ON_ONCE(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
+                       /* Gracefully handle sparsely unmapped regions to avoid leaving
+                        * page table pages behind when the drm_gpuvm and VM page table
+                        * are out-of-sync. This is not supposed to happen, hence the
+                        * above WARN_ON().
+                        */
+                       while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
+                              unmapped_sz < pgsize * pgcount)
+                               unmapped_sz += SZ_4K;
 
-               if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
-                       drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
-                               iova + offset + unmapped_sz,
-                               iova + offset + pgsize * pgcount,
-                               iova, iova + size);
-                       return  -EINVAL;
+                       /* We're past the point where we can try to fix things,
+                        * so flag the VM unusable to make sure it's not going
+                        * to be used anymore.
+                        */
+                       panthor_vm_declare_unusable(vm);
+
+                       /* If we don't make progress, we're screwed. That also means
+                        * something else prevents us from unmapping the region, but
+                        * there's not much we can do here: time for debugging.
+                        */
+                       if (drm_WARN_ON_ONCE(&ptdev->base, !unmapped_sz))
+                               return;
                }
+
                offset += unmapped_sz;
        }
-
-       return 0;
 }
 
 static int
@@ -917,16 +951,17 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
                        paddr += mapped;
                        len -= mapped;
 
-                       if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
+                       /* If nothing was mapped, consider it an ENOMEM. */
+                       if (!ret && !mapped)
                                ret = -ENOMEM;
 
-                       if (ret) {
-                               /* If something failed, unmap what we've already mapped before
-                                * returning. The unmap call is not supposed to fail.
+                       /* If something fails, we stop there, and flag the VM unusable. */
+                       if (drm_WARN_ON_ONCE(&ptdev->base, ret)) {
+                               /* Unmap what we've already mapped to avoid leaving page
+                                * table pages behind.
                                 */
-                               drm_WARN_ON(&ptdev->base,
-                                           panthor_vm_unmap_pages(vm, start_iova,
-                                                                  iova - start_iova));
+                               panthor_vm_unmap_pages(vm, start_iova, iova - start_iova);
+                               panthor_vm_declare_unusable(vm);
                                return ret;
                        }
                }
@@ -2109,12 +2144,9 @@ static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
        struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
        struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
        u64 unmap_start, unmap_range;
-       int ret;
 
        drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
-       ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
-       if (ret)
-               return ret;
+       panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
 
        if (op->remap.prev) {
                prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
@@ -2154,13 +2186,9 @@ static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
 {
        struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
        struct panthor_vm *vm = priv;
-       int ret;
-
-       ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
-                                    unmap_vma->base.va.range);
-       if (drm_WARN_ON(&vm->ptdev->base, ret))
-               return ret;
 
+       panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
+                              unmap_vma->base.va.range);
        drm_gpuva_unmap(&op->unmap);
        panthor_vma_unlink(vm, unmap_vma);
        return 0;
@@ -2240,7 +2268,7 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
 
 out:
        if (ret && flag_vm_unusable_on_failure)
-               vm->unusable = true;
+               panthor_vm_declare_unusable(vm);
 
        vm->op_ctx = NULL;
        mutex_unlock(&vm->op_lock);
-- 
2.51.1
