On 13/11/2025 10:39, Boris Brezillon wrote:
> There's no reason for panthor_vm_[un]map_pages() to fail unless the
> drm_gpuvm state and the page table are out of sync, so let's reflect that
> by making panthor_vm_unmap_pages() a void function and adding
> WARN_ON()s in various places. We also try to recover from those
> unexpected mismatches by checking for already unmapped ranges and skipping
> them. But there's only so much we can do to try and cope with such
> SW bugs, so when we see a mismatch, we flag the VM unusable and disable
> the AS to avoid further GPU accesses to the memory.
>
> It could be that the as_disable() call fails because the MMU unit is
> stuck, in which case the whole GPU is frozen, and only a GPU reset can
> unblock things. After the reset, the VM will be seen as unusable and
> any attempt to re-use it will fail, so we should be covered for any
> use-after-unmap issues.
>
> v2:
> - Fix double unlock
>
> Signed-off-by: Boris Brezillon <[email protected]>
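The "any attempt to re-use it will fail" part relies on the driver's
existing vm->unusable checks on the submission paths. For readers of
the archive, the caller-side pattern boils down to something like the
sketch below (illustrative only; submit_to_vm() is a made-up name, not
a function in the driver):

static int submit_to_vm(struct panthor_vm *vm)
{
        /* The VM was flagged unusable after a gpuvm/page-table
         * mismatch and its AS has been disabled, so reject it up
         * front instead of letting the GPU fault on dead mappings.
         */
        if (vm->unusable)
                return -EINVAL;

        /* ...queue the map/unmap op or job as usual... */
        return 0;
}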
Reviewed-by: Steven Price <[email protected]>

> ---
>  drivers/gpu/drm/panthor/panthor_mmu.c | 82 ++++++++++++++++++---------
>  1 file changed, 55 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
> index 21389137a324..35aad1e0ecaa 100644
> --- a/drivers/gpu/drm/panthor/panthor_mmu.c
> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c
> @@ -842,12 +842,32 @@ static size_t get_pgsize(u64 addr, size_t size, size_t *count)
>          return SZ_2M;
>  }
>
> -static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
> +static void panthor_vm_declare_unusable(struct panthor_vm *vm)
> +{
> +        struct panthor_device *ptdev = vm->ptdev;
> +        int cookie;
> +
> +        if (vm->unusable)
> +                return;
> +
> +        vm->unusable = true;
> +        mutex_lock(&ptdev->mmu->as.slots_lock);
> +        if (vm->as.id >= 0 && drm_dev_enter(&ptdev->base, &cookie)) {
> +                panthor_mmu_as_disable(ptdev, vm->as.id);
> +                drm_dev_exit(cookie);
> +        }
> +        mutex_unlock(&ptdev->mmu->as.slots_lock);
> +}
> +
> +static void panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>  {
>          struct panthor_device *ptdev = vm->ptdev;
>          struct io_pgtable_ops *ops = vm->pgtbl_ops;
>          u64 offset = 0;
>
> +        if (!size)
> +                return;
> +
>          drm_WARN_ON(&ptdev->base,
>                      (iova < vm->locked_region.start) ||
>                      (iova + size > vm->locked_region.start + vm->locked_region.size));
> @@ -858,18 +878,32 @@ static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size)
>                  size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount);
>
>                  unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL);
> +                if (drm_WARN_ON_ONCE(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
> +                        /* Gracefully handle sparsely unmapped regions to avoid leaving
> +                         * page table pages behind when the drm_gpuvm and VM page table
> +                         * are out-of-sync. This is not supposed to happen, hence the
> +                         * above WARN_ON().
> +                         */
> +                        while (!ops->iova_to_phys(ops, iova + unmapped_sz) &&
> +                               unmapped_sz < pgsize * pgcount)
> +                                unmapped_sz += SZ_4K;
>
> -                if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) {
> -                        drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n",
> -                                iova + offset + unmapped_sz,
> -                                iova + offset + pgsize * pgcount,
> -                                iova, iova + size);
> -                        return -EINVAL;
> +                        /* We're past the point where we can try to fix things,
> +                         * so flag the VM unusable to make sure it's not going
> +                         * to be used anymore.
> +                         */
> +                        panthor_vm_declare_unusable(vm);
> +
> +                        /* If we don't make progress, we're screwed. That also means
> +                         * something else prevents us from unmapping the region, but
> +                         * there's not much we can do here: time for debugging.
> +                         */
> +                        if (drm_WARN_ON_ONCE(&ptdev->base, !unmapped_sz))
> +                                return;
>                  }
> +
>                  offset += unmapped_sz;
>          }
> -
> -        return 0;
>  }
>
>  static int
> @@ -917,16 +951,17 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
>                          paddr += mapped;
>                          len -= mapped;
>
> -                        if (drm_WARN_ON(&ptdev->base, !ret && !mapped))
> +                        /* If nothing was mapped, consider it an ENOMEM. */
> +                        if (!ret && !mapped)
>                                  ret = -ENOMEM;
>
> -                        if (ret) {
> -                                /* If something failed, unmap what we've already mapped before
> -                                 * returning. The unmap call is not supposed to fail.
> +                        /* If something fails, we stop there, and flag the VM unusable. */
> +                        if (drm_WARN_ON_ONCE(&ptdev->base, ret)) {
> +                                /* Unmap what we've already mapped to avoid leaving page
> +                                 * table pages behind.
>                                  */
> -                                drm_WARN_ON(&ptdev->base,
> -                                            panthor_vm_unmap_pages(vm, start_iova,
> -                                                                   iova - start_iova));
> +                                panthor_vm_unmap_pages(vm, start_iova, iova - start_iova);
> +                                panthor_vm_declare_unusable(vm);
>                                  return ret;
>                          }
>                  }
> @@ -2109,12 +2144,9 @@ static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op,
>          struct panthor_vm_op_ctx *op_ctx = vm->op_ctx;
>          struct panthor_vma *prev_vma = NULL, *next_vma = NULL;
>          u64 unmap_start, unmap_range;
> -        int ret;
>
>          drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range);
> -        ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
> -        if (ret)
> -                return ret;
> +        panthor_vm_unmap_pages(vm, unmap_start, unmap_range);
>
>          if (op->remap.prev) {
>                  prev_vma = panthor_vm_op_ctx_get_vma(op_ctx);
> @@ -2154,13 +2186,9 @@ static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op,
>  {
>          struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base);
>          struct panthor_vm *vm = priv;
> -        int ret;
> -
> -        ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
> -                                     unmap_vma->base.va.range);
> -        if (drm_WARN_ON(&vm->ptdev->base, ret))
> -                return ret;
>
> +        panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr,
> +                               unmap_vma->base.va.range);
>          drm_gpuva_unmap(&op->unmap);
>          panthor_vma_unlink(vm, unmap_vma);
>          return 0;
> @@ -2240,7 +2268,7 @@ panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op,
>
>  out:
>          if (ret && flag_vm_unusable_on_failure)
> -                vm->unusable = true;
> +                panthor_vm_declare_unusable(vm);
>
>          vm->op_ctx = NULL;
>          mutex_unlock(&vm->op_lock);
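One nice property of the recovery loop in panthor_vm_unmap_pages()
above: it only ever skips pages that are already absent from the page
table, so the !unmapped_sz warning is exactly the "no progress" case.
Reduced to its essence, the skip logic looks like the sketch below (a
sketch under the same io-pgtable API, not the exact code from the
patch; the helper name skip_unmapped_prefix() is made up):

static u64 skip_unmapped_prefix(struct io_pgtable_ops *ops, u64 iova, u64 len)
{
        u64 skipped = 0;

        /* Walk forward in SZ_4K steps, the smallest granule the
         * io-pgtable can hold, past pages that are already unmapped.
         */
        while (skipped < len && !ops->iova_to_phys(ops, iova + skipped))
                skipped += SZ_4K;

        /* 0 means no forward progress was made: the caller gives up
         * and leaves the VM flagged unusable for later debugging.
         */
        return skipped;
}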
