Reviewed-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>

With the global BO list in place, we should be able to remove some radv_cs_add_buffer() calls and re-enable local BOs to reduce overhead, but this can be done later.

Make sure to check the sysmem submission path (radv_amdgpu_winsys_cs_submit_sysmem) as well.
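
For anyone skimming the patch: the new per-device list is just a mutex-protected growable array that vkAllocateMemory/vkFreeMemory add to and remove from, and that radv_QueueSubmit hands to the winsys while holding the mutex. Below is a rough standalone sketch of that pattern; the names and types are placeholders for illustration, not the actual radv structs. Compile with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bo;                      /* stand-in for struct radeon_winsys_bo */

struct bo_list {
        struct bo **bos;        /* growable array of resident BOs */
        unsigned count;
        unsigned capacity;
        pthread_mutex_t mutex;  /* guards all of the above */
};

static void bo_list_init(struct bo_list *list)
{
        pthread_mutex_init(&list->mutex, NULL);
        list->bos = NULL;
        list->count = list->capacity = 0;
}

static int bo_list_add(struct bo_list *list, struct bo *bo)
{
        pthread_mutex_lock(&list->mutex);
        if (list->count == list->capacity) {
                /* double the capacity on demand, starting at 4 */
                unsigned capacity = list->capacity ? list->capacity * 2 : 4;
                void *data = realloc(list->bos, capacity * sizeof(struct bo *));
                if (!data) {
                        pthread_mutex_unlock(&list->mutex);
                        return -1;  /* VK_ERROR_OUT_OF_HOST_MEMORY in the driver */
                }
                list->bos = data;
                list->capacity = capacity;
        }
        list->bos[list->count++] = bo;
        pthread_mutex_unlock(&list->mutex);
        return 0;
}

static void bo_list_remove(struct bo_list *list, struct bo *bo)
{
        pthread_mutex_lock(&list->mutex);
        for (unsigned i = 0; i < list->count; ++i) {
                if (list->bos[i] == bo) {
                        /* unordered removal: swap the last element into the hole */
                        list->bos[i] = list->bos[--list->count];
                        break;
                }
        }
        pthread_mutex_unlock(&list->mutex);
}

int main(void)
{
        struct bo_list list;
        bo_list_init(&list);
        /* fake BOs: any distinct pointer values will do for the sketch */
        struct bo *a = (struct bo *)0x1, *b = (struct bo *)0x2;
        bo_list_add(&list, a);
        bo_list_add(&list, b);
        bo_list_remove(&list, a);
        printf("%u BO(s) resident\n", list.count);  /* prints: 1 BO(s) resident */
        free(list.bos);
        return 0;
}

Removal swaps the last element into the freed slot instead of shifting, so order is not preserved; that is fine here because the kernel only needs the set of resident BOs, not any particular order.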

On 04/12/2018 01:44 AM, Bas Nieuwenhuizen wrote:
With update after bind we can no longer attach BOs to the command
buffer from the descriptor set, so we have to keep a global BO list
instead.

I am somewhat surprised this works as well as it does: the WSI relies
on implicit synchronization based on the BO list associations, and with
the new behavior every command buffer is associated with every
swapchain image. Still, I could not find any slowdown in games because
of it.
---
  src/amd/vulkan/radv_device.c                  | 125 +++++++++++++-----
  src/amd/vulkan/radv_private.h                 |   8 ++
  src/amd/vulkan/radv_radeon_winsys.h           |   6 +
  src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c |  46 ++++++-
  4 files changed, 146 insertions(+), 39 deletions(-)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 22e8f1e7a78..c81b69fef5c 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1208,6 +1208,55 @@ radv_queue_finish(struct radv_queue *queue)
                queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
  }
+static void
+radv_bo_list_init(struct radv_bo_list *bo_list)
+{
+       pthread_mutex_init(&bo_list->mutex, NULL);
+       bo_list->list.count = bo_list->capacity = 0;
+       bo_list->list.bos = NULL;
+}
+
+static void
+radv_bo_list_finish(struct radv_bo_list *bo_list)
+{
+       free(bo_list->list.bos);
+       pthread_mutex_destroy(&bo_list->mutex);
+}
+
+static VkResult radv_bo_list_add(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
+{
+       pthread_mutex_lock(&bo_list->mutex);
+       if (bo_list->list.count == bo_list->capacity) {
+               unsigned capacity = MAX2(4, bo_list->capacity * 2);
+               void *data = realloc(bo_list->list.bos, capacity * sizeof(struct radeon_winsys_bo*));
+
+               if (!data) {
+                       pthread_mutex_unlock(&bo_list->mutex);
+                       return VK_ERROR_OUT_OF_HOST_MEMORY;
+               }
+
+               bo_list->list.bos = (struct radeon_winsys_bo**)data;
+               bo_list->capacity = capacity;
+       }
+
+       bo_list->list.bos[bo_list->list.count++] = bo;
+       pthread_mutex_unlock(&bo_list->mutex);
+       return VK_SUCCESS;
+}
+
+static void radv_bo_list_remove(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
+{
+       pthread_mutex_lock(&bo_list->mutex);
+       for(unsigned i = 0; i < bo_list->list.count; ++i) {
+               if (bo_list->list.bos[i] == bo) {
+                       bo_list->list.bos[i] = bo_list->list.bos[bo_list->list.count - 1];
+                       --bo_list->list.count;
+                       break;
+               }
+       }
+       pthread_mutex_unlock(&bo_list->mutex);
+}
+
  static void
  radv_device_init_gs_info(struct radv_device *device)
  {
@@ -1308,6 +1357,8 @@ VkResult radv_CreateDevice(
        mtx_init(&device->shader_slab_mutex, mtx_plain);
        list_inithead(&device->shader_slabs);

+       radv_bo_list_init(&device->bo_list);
+
        for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
                const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
                uint32_t qfi = queue_create->queueFamilyIndex;
@@ -1440,6 +1491,8 @@ VkResult radv_CreateDevice(
  fail_meta:
        radv_device_finish_meta(device);
  fail:
+       radv_bo_list_finish(&device->bo_list);
+
        if (device->trace_bo)
                device->ws->buffer_destroy(device->trace_bo);
@@ -1487,6 +1540,7 @@ void radv_DestroyDevice(
        radv_destroy_shader_slabs(device);

+       radv_bo_list_finish(&device->bo_list);
        vk_free(&device->alloc, device);
  }
@@ -2257,7 +2311,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
        ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
                                           &queue->device->empty_cs[queue->queue_family_index],
-                                          1, NULL, NULL, &sem_info,
+                                          1, NULL, NULL, &sem_info, NULL,
                                           false, fence->fence);
        radv_free_sem_info(&sem_info);
@@ -2334,7 +2388,7 @@ VkResult radv_QueueSubmit(
                                ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
                                                                   &queue->device->empty_cs[queue->queue_family_index],
                                                                   1, NULL, NULL,
-                                                                  &sem_info,
+                                                                  &sem_info, NULL,
                                                                   false, base_fence);
                                if (ret) {
                                        radv_loge("failed to submit CS %d\n", i);
@@ -2372,11 +2426,15 @@ VkResult radv_QueueSubmit(
                        sem_info.cs_emit_wait = j == 0;
                        sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;

+                       pthread_mutex_lock(&queue->device->bo_list.mutex);
+
                        ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
                                                        advance, initial_preamble, continue_preamble_cs,
-                                                          &sem_info,
+                                                          &sem_info, &queue->device->bo_list.list,
                                                        can_patch, base_fence);

+                       pthread_mutex_unlock(&queue->device->bo_list.mutex);
+
                        if (ret) {
                                radv_loge("failed to submit CS %d\n", i);
                                abort();
@@ -2582,11 +2640,8 @@ static VkResult radv_alloc_memory(struct radv_device *device,
                        goto fail;
                } else {
                        close(import_info->fd);
-                       goto out_success;
                }
-       }
-
-       if (host_ptr_info) {
+       } else if (host_ptr_info) {
                assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
                assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED);
                mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer,
@@ -2596,41 +2651,46 @@ static VkResult radv_alloc_memory(struct radv_device *device,
                        goto fail;
                } else {
                        mem->user_ptr = host_ptr_info->pHostPointer;
-                       goto out_success;
                }
-       }
-
-       uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
-       if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
-           mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
-               domain = RADEON_DOMAIN_GTT;
-       else
-               domain = RADEON_DOMAIN_VRAM;
+       } else {
+               uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+               if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
+                   mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
+                       domain = RADEON_DOMAIN_GTT;
+               else
+                       domain = RADEON_DOMAIN_VRAM;

-       if (mem_type_index == RADV_MEM_TYPE_VRAM)
-               flags |= RADEON_FLAG_NO_CPU_ACCESS;
-       else
-               flags |= RADEON_FLAG_CPU_ACCESS;
+               if (mem_type_index == RADV_MEM_TYPE_VRAM)
+                       flags |= RADEON_FLAG_NO_CPU_ACCESS;
+               else
+                       flags |= RADEON_FLAG_CPU_ACCESS;

-       if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
-               flags |= RADEON_FLAG_GTT_WC;
+               if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
+                       flags |= RADEON_FLAG_GTT_WC;

-       if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
-               flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+               if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
+                       flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;

-       mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
-                                              domain, flags);
+               mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
+                                                   domain, flags);

-       if (!mem->bo) {
-               result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-               goto fail;
+               if (!mem->bo) {
+                       result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+                       goto fail;
+               }
+               mem->type_index = mem_type_index;
        }
-       mem->type_index = mem_type_index;
-out_success:
+
+       result = radv_bo_list_add(&device->bo_list, mem->bo);
+       if (result != VK_SUCCESS)
+               goto fail_bo;
+
        *pMem = radv_device_memory_to_handle(mem);

        return VK_SUCCESS;

+fail_bo:
+       device->ws->buffer_destroy(mem->bo);
  fail:
        vk_free2(&device->alloc, pAllocator, mem);
@@ -2658,6 +2718,7 @@ void radv_FreeMemory(
        if (mem == NULL)
                return;

+       radv_bo_list_remove(&device->bo_list, mem->bo);
        device->ws->buffer_destroy(mem->bo);
        mem->bo = NULL;
@@ -2977,7 +3038,7 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
                        queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
                                                     &queue->device->empty_cs[queue->queue_family_index],
                                                     1, NULL, NULL,
-                                                    &sem_info,
+                                                    &sem_info, NULL,
                                                     false, base_fence);
                        fence_emitted = true;
                        if (fence)
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 1bcc3a906ec..9bed7ba07c2 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -596,6 +596,12 @@ struct radv_queue {
        struct radeon_winsys_cs *continue_preamble_cs;
  };
+struct radv_bo_list {
+       struct radv_winsys_bo_list list;
+       unsigned capacity;
+       pthread_mutex_t mutex;
+};
+
  struct radv_device {
        VK_LOADER_DATA                              _loader_data;
@@ -658,6 +664,8 @@ struct radv_device {
        uint64_t dmesg_timestamp;

        struct radv_device_extension_table enabled_extensions;
+
+       struct radv_bo_list bo_list;
  };

  struct radv_device_memory {
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index 270b3bceaba..cfde19d1ae1 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -177,6 +177,11 @@ struct radv_winsys_sem_info {
        struct radv_winsys_sem_counts signal;
  };
+struct radv_winsys_bo_list {
+       struct radeon_winsys_bo **bos;
+       unsigned count;
+};
+
  struct radeon_winsys {
        void (*destroy)(struct radeon_winsys *ws);
@@ -245,6 +250,7 @@ struct radeon_winsys {
                         struct radeon_winsys_cs *initial_preamble_cs,
                         struct radeon_winsys_cs *continue_preamble_cs,
                         struct radv_winsys_sem_info *sem_info,
+                        const struct radv_winsys_bo_list *bo_list, /* optional */
                         bool can_patch,
                         struct radeon_winsys_fence *fence);
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index cd7ab384e73..a975f0e4969 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -550,6 +550,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                                      unsigned count,
                                      struct radv_amdgpu_winsys_bo *extra_bo,
                                      struct radeon_winsys_cs *extra_cs,
+                                     const struct radv_winsys_bo_list *radv_bo_list,
                                      amdgpu_bo_list_handle *bo_list)
  {
        int r = 0;
@@ -577,7 +578,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                                          bo_list);
                free(handles);
                pthread_mutex_unlock(&ws->global_bo_list_lock);
-       } else if (count == 1 && !extra_bo && !extra_cs &&
+       } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list &&
                   !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
                struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
                if (cs->num_buffers == 0) {
@@ -599,6 +600,11 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                if (extra_cs) {
                        total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
                }
+
+               if (radv_bo_list) {
+                       total_buffer_count += radv_bo_list->count;
+               }
+
                if (total_buffer_count == 0) {
                        *bo_list = 0;
                        return 0;
@@ -672,6 +678,27 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
                        }
                }

+               if (radv_bo_list) {
+                       unsigned unique_bo_so_far = unique_bo_count;
+                       const unsigned default_bo_priority = 7;
+                       for (unsigned i = 0; i < radv_bo_list->count; ++i) {
+                               struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
+                               bool found = false;
+                               for (unsigned j = 0; j < unique_bo_so_far; ++j) {
+                                       if (bo->bo == handles[j]) {
+                                               found = true;
+                                               priorities[j] = MAX2(priorities[j], default_bo_priority);
+                                               break;
+                                       }
+                               }
+                               if (!found) {
+                                       handles[unique_bo_count] = bo->bo;
+                                       priorities[unique_bo_count] = default_bo_priority;
+                                       ++unique_bo_count;
+                               }
+                       }
+               }
+
                if (unique_bo_count > 0) {
                        r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
                                                  priorities, bo_list);
@@ -707,6 +734,7 @@ static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
  static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
                                                int queue_idx,
                                                struct radv_winsys_sem_info *sem_info,
+                                               const struct radv_winsys_bo_list *radv_bo_list,
                                                struct radeon_winsys_cs **cs_array,
                                                unsigned cs_count,
                                                struct radeon_winsys_cs *initial_preamble_cs,
@@ -743,7 +771,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
                }
        }

-       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, &bo_list);
+       r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs,
+                                      radv_bo_list, &bo_list);
        if (r) {
                fprintf(stderr, "amdgpu: buffer list creation failed for the "
                                "chained submission(%d)\n", r);
@@ -787,6 +816,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
  static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
                                                 int queue_idx,
                                                 struct radv_winsys_sem_info *sem_info,
+                                                const struct radv_winsys_bo_list *radv_bo_list,
                                                 struct radeon_winsys_cs **cs_array,
                                                 unsigned cs_count,
                                                 struct radeon_winsys_cs *initial_preamble_cs,
@@ -811,7 +841,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
                memset(&request, 0, sizeof(request));

                r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
-                                              preamble_cs, &bo_list);
+                                              preamble_cs, radv_bo_list, &bo_list);
                if (r) {
                        fprintf(stderr, "amdgpu: buffer list creation failed "
                                        "for the fallback submission (%d)\n", 
r);
@@ -868,6 +898,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
  static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
                                               int queue_idx,
                                               struct radv_winsys_sem_info *sem_info,
+                                              const struct radv_winsys_bo_list *radv_bo_list,
                                               struct radeon_winsys_cs **cs_array,
                                               unsigned cs_count,
                                               struct radeon_winsys_cs *initial_preamble_cs,
@@ -937,7 +968,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,

                r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
                                               (struct radv_amdgpu_winsys_bo*)bo,
-                                              preamble_cs, &bo_list);
+                                              preamble_cs, radv_bo_list, &bo_list);
                if (r) {
                        fprintf(stderr, "amdgpu: buffer list creation failed "
                                        "for the sysmem submission (%d)\n", r);
@@ -988,6 +1019,7 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
                                        struct radeon_winsys_cs *initial_preamble_cs,
                                        struct radeon_winsys_cs *continue_preamble_cs,
                                        struct radv_winsys_sem_info *sem_info,
+                                       const struct radv_winsys_bo_list *bo_list,
                                        bool can_patch,
                                        struct radeon_winsys_fence *_fence)
  {
@@ -997,13 +1029,13 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,

        assert(sem_info);
        if (!cs->ws->use_ib_bos) {
-               ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                           cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        } else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
-               ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                            cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        } else {
-               ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array,
+               ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, bo_list, cs_array,
                                                             cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
        }