When many virtual address spaces are in use, there can be thousands
of page table BOs. amdgpu_vm_update_level iterates over all of them
recursively. In many cases only a few of them, or none at all, need
to be updated. Minimize unnecessary code execution and memory usage
in those cases.

This speeds up memory mapping in a synthetic KFD memory mapping
benchmark by roughly a factor of two.
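
The change amounts to a lazy-setup pattern: skip the expensive
per-call setup (the kmap or the IB allocation) entirely when no page
directory entry changes, and perform it at most once otherwise. Below
is a minimal, self-contained sketch of that pattern for illustration;
it is not driver code, and all names in it (expensive_setup,
entry_needs_update, update_directory) are made up for the example:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int setup_calls;

/* Stand-in for the expensive per-call setup (kmap / IB allocation). */
static void *expensive_setup(void)
{
        setup_calls++;
        return malloc(64);
}

/* Stand-in for the cheap "does this entry need updating?" check. */
static bool entry_needs_update(size_t idx)
{
        return idx == 3;        /* pretend only one entry changed */
}

/* Walk the entries; pay the setup cost only if something must be written. */
static int update_directory(size_t nr_entries)
{
        void *map = NULL;       /* stays NULL while no entry changes */
        size_t i;

        for (i = 0; i < nr_entries; i++) {
                if (!entry_needs_update(i))
                        continue;

                if (!map) {
                        map = expensive_setup();
                        if (!map)
                                return -1;
                }
                /* ... write entry i through map ... */
        }

        free(map);
        return 0;
}

int main(void)
{
        update_directory(16);
        printf("expensive setups: %d\n", setup_calls);  /* prints 1 */
        return 0;
}

If none of the entries had changed, the setup would have been skipped
entirely, which is the case this patch optimizes for.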

Signed-off-by: Felix Kuehling <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ff5de3a..23b899b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 {
        struct amdgpu_bo *shadow;
        struct amdgpu_ring *ring = NULL;
-       uint64_t pd_addr, shadow_addr = 0;
+       uint64_t pd_addr = 0, shadow_addr = 0;
        uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
        uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
        unsigned count = 0, pt_idx, ndw = 0;
@@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
        WARN_ON(vm->use_cpu_for_update && shadow);
        if (vm->use_cpu_for_update && !shadow) {
-               r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
-               if (r)
-                       return r;
-               r = amdgpu_vm_bo_wait(adev, parent->bo);
-               if (unlikely(r)) {
-                       amdgpu_bo_kunmap(parent->bo);
-                       return r;
-               }
+               /* Defer kmapping until it's actually needed. Some
+                * PDBs may need no update at all
+                */
                params.func = amdgpu_vm_cpu_set_ptes;
+               params.ib = (void *)(long)-1;
        } else {
-               if (shadow) {
-                       r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
-                       if (r)
-                               return r;
-               }
-               ring = container_of(vm->entity.sched, struct amdgpu_ring,
-                                   sched);
-
-               /* padding, etc. */
-               ndw = 64;
-
-               /* assume the worst case */
-               ndw += parent->last_entry_used * 6;
-
-               pd_addr = amdgpu_bo_gpu_offset(parent->bo);
-
-               if (shadow) {
-                       shadow_addr = amdgpu_bo_gpu_offset(shadow);
-                       ndw *= 2;
-               } else {
-                       shadow_addr = 0;
-               }
-
-               r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-               if (r)
-                       return r;
-
-               params.ib = &job->ibs[0];
+               /* Defer IB allocation until it's actually
+                * needed. Some PDBs may need no update at all
+                */
+               params.ib = NULL;
                params.func = amdgpu_vm_do_set_ptes;
        }
 
-
        /* walk over the address space and update the directory */
        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
                struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
@@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
                if (bo == NULL)
                        continue;
 
-               if (bo->shadow) {
-                       struct amdgpu_bo *pt_shadow = bo->shadow;
-
-                       r = amdgpu_ttm_bind(&pt_shadow->tbo,
-                                           &pt_shadow->tbo.mem);
-                       if (r)
-                               return r;
-               }
-
-               pt = amdgpu_bo_gpu_offset(bo);
-               pt = amdgpu_gart_get_vm_pde(adev, pt);
+               pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
                if (parent->entries[pt_idx].addr == pt)
                        continue;
 
                parent->entries[pt_idx].addr = pt;
 
+               if (!params.ib) {
+                       if (shadow) {
+                               r = amdgpu_ttm_bind(&shadow->tbo,
+                                                   &shadow->tbo.mem);
+                               if (r)
+                                       return r;
+                       }
+
+                       ring = container_of(vm->entity.sched,
+                                           struct amdgpu_ring, sched);
+
+                       /* padding, etc. */
+                       ndw = 64;
+
+                       /* assume the worst case */
+                       ndw += (parent->last_entry_used - pt_idx) * 6;
+
+                       pd_addr = parent->bo->tbo.offset;
+
+                       if (shadow) {
+                               shadow_addr = shadow->tbo.offset;
+                               ndw *= 2;
+                       } else {
+                               shadow_addr = 0;
+                       }
+                       r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+                       if (r)
+                               return r;
+
+                       params.ib = &job->ibs[0];
+               } else if (!pd_addr) {
+                       r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
+                       if (r)
+                               return r;
+                       r = amdgpu_vm_bo_wait(adev, parent->bo);
+                       if (unlikely(r)) {
+                               amdgpu_bo_kunmap(parent->bo);
+                               return r;
+                       }
+               }
+
                pde = pd_addr + pt_idx * 8;
                if (((last_pde + 8 * count) != pde) ||
                    ((last_pt + incr * count) != pt) ||
@@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
        if (params.func == amdgpu_vm_cpu_set_ptes)
                amdgpu_bo_kunmap(parent->bo);
-       else if (params.ib->length_dw == 0) {
+       else if (params.ib && params.ib->length_dw == 0) {
                amdgpu_job_free(job);
-       } else {
+       } else if (params.ib) {
                amdgpu_ring_pad_ib(ring, params.ib);
                amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
                                 AMDGPU_FENCE_OWNER_VM);
@@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
                amdgpu_bo_fence(parent->bo, fence, true);
                dma_fence_put(vm->last_dir_update);
-               vm->last_dir_update = dma_fence_get(fence);
-               dma_fence_put(fence);
+               vm->last_dir_update = fence;
        }
        /*
         * Recurse into the subdirectories. This recursion is harmless because
@@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
                struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
 
-               if (!entry->bo)
+               if (!entry->bo || !entry->entries)
                        continue;
 
                r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
-- 
1.9.1

