Module: Mesa
Branch: main
Commit: 2dc452ec7cab6e71a4d43949d420760b502a4049
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2dc452ec7cab6e71a4d43949d420760b502a4049

Author: Lionel Landwerlin <lionel.g.landwer...@intel.com>
Date:   Thu Nov  2 22:50:38 2023 +0200

anv: dynamically allocate utrace batch buffers

Estimating the batch space required can be tricky because of all the
workarounds. So implement chaining of batches like we do for command
buffers.

Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
Reviewed-by: José Roberto de Souza <jose.so...@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26087>

---

 src/intel/vulkan/anv_private.h          |   2 +-
 src/intel/vulkan/anv_utrace.c           | 110 +++++++++++++++++++-------------
 src/intel/vulkan/i915/anv_batch_chain.c |  45 +++++++------
 src/intel/vulkan/xe/anv_batch_chain.c   |  19 ++++--
 4 files changed, 106 insertions(+), 70 deletions(-)

diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 36c078e23a7..d964c3bb05b 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -5509,7 +5509,7 @@ struct anv_utrace_submit {
     */
    struct anv_reloc_list relocs;
    struct anv_batch batch;
-   struct anv_bo *batch_bo;
+   struct util_dynarray batch_bos;
 
    /* Stream for temporary allocations */
    struct anv_state_stream dynamic_state_stream;
diff --git a/src/intel/vulkan/anv_utrace.c b/src/intel/vulkan/anv_utrace.c
index 68c519587b1..93759e58b73 100644
--- a/src/intel/vulkan/anv_utrace.c
+++ b/src/intel/vulkan/anv_utrace.c
@@ -25,7 +25,7 @@
 #include "anv_internal_kernels.h"
 
 #include "ds/intel_tracepoints.h"
-#include "genxml/gen8_pack.h"
+#include "genxml/gen9_pack.h"
 #include "perf/intel_perf.h"
 #include "util/perf/cpu_trace.h"
 
@@ -88,10 +88,9 @@ anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
    if (submit->trace_bo)
       anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
 
-   if (submit->batch_bo) {
-      anv_reloc_list_finish(&submit->relocs);
-      anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
-   }
+   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+   util_dynarray_fini(&submit->batch_bos);
 
    vk_sync_destroy(&device->vk, submit->sync);
 
@@ -151,6 +150,44 @@ anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
       push_data_state);
 }
 
+static VkResult
+anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
+                               void *user_data)
+{
+   struct anv_utrace_submit *submit = user_data;
+
+   uint32_t alloc_size = 0;
+   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+      alloc_size += (*bo)->size;
+   alloc_size = MAX2(alloc_size * 2, 8192);
+
+   struct anv_bo *bo;
+   VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
+                                       align(alloc_size, 4096),
+                                       &bo);
+   if (result != VK_SUCCESS)
+      return result;
+
+   util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);
+
+   batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;
+
+   anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
+      bbs.DWordLength               = GFX9_MI_BATCH_BUFFER_START_length -
+                                      GFX9_MI_BATCH_BUFFER_START_length_bias;
+      bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
+      bbs.AddressSpaceIndicator     = ASI_PPGTT;
+      bbs.BatchBufferStartAddress   = (struct anv_address) { bo, 0 };
+   }
+
+   anv_batch_set_storage(batch,
+                         (struct anv_address) { .bo = bo, },
+                         bo->map,
+                         bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);
+
+   return VK_SUCCESS;
+}
+
 VkResult
 anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
                                     uint32_t cmd_buffer_count,
@@ -175,6 +212,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
    if (!submit)
       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+   submit->queue = queue;
+
    intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
 
    result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
@@ -182,6 +221,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
    if (result != VK_SUCCESS)
       goto error_sync;
 
+   util_dynarray_init(&submit->batch_bos, NULL);
+
    if (utrace_copies > 0) {
       result = anv_bo_pool_alloc(&device->utrace_bo_pool,
                                  utrace_copies * 4096,
@@ -189,22 +230,6 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
       if (result != VK_SUCCESS)
          goto error_trace_buf;
 
-      uint32_t batch_size = 512; /* 128 dwords of setup */
-      if (intel_needs_workaround(device->info, 16013994831)) {
-         /* Enable/Disable preemption at the begin/end */
-         batch_size += 2 * (250 /* 250 MI_NOOPs*/ +
-                            6   /* PIPE_CONTROL */ +
-                            3   /* MI_LRI */) * 4 /* dwords */;
-      }
-      batch_size += 256 * utrace_copies; /* 64 dwords per copy */
-      batch_size = align(batch_size + 4, 8); /* MI_BATCH_BUFFER_END */
-
-      result = anv_bo_pool_alloc(&device->utrace_bo_pool,
-                                 align(batch_size, 4096),
-                                 &submit->batch_bo);
-      if (result != VK_SUCCESS)
-         goto error_batch_buf;
-
       const bool uses_relocs = device->physical->uses_relocs;
       result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
       if (result != VK_SUCCESS)
@@ -215,11 +240,12 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
       anv_state_stream_init(&submit->general_state_stream,
                             &device->general_state_pool, 16384);
 
-      submit->batch.alloc = &device->vk.alloc;
-      submit->batch.relocs = &submit->relocs;
-      anv_batch_set_storage(&submit->batch,
-                            (struct anv_address) { .bo = submit->batch_bo, },
-                            submit->batch_bo->map, submit->batch_bo->size);
+      submit->batch = (struct anv_batch) {
+         .alloc = &device->vk.alloc,
+         .relocs = &submit->relocs,
+         .user_data = submit,
+         .extend_cb = anv_utrace_submit_extend_batch,
+      };
 
       /* Only engine class where we support timestamp copies
        *
@@ -304,17 +330,15 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
       }
    }
 
-   submit->queue = queue;
-
    *out_submit = submit;
 
    return VK_SUCCESS;
 
  error_batch:
    anv_reloc_list_finish(&submit->relocs);
+   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
  error_reloc_list:
-   anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
- error_batch_buf:
    anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
  error_trace_buf:
    vk_sync_destroy(&device->vk, submit->sync);
@@ -555,21 +579,17 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
    if (result != VK_SUCCESS)
       goto error_trace;
 
-   result = anv_bo_pool_alloc(&device->utrace_bo_pool, 4096,
-                              &submit->batch_bo);
-   if (result != VK_SUCCESS)
-      goto error_sync;
-
    const bool uses_relocs = device->physical->uses_relocs;
    result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
    if (result != VK_SUCCESS)
-      goto error_batch_bo;
+      goto error_sync;
 
-   submit->batch.alloc = &device->vk.alloc;
-   submit->batch.relocs = &submit->relocs;
-   anv_batch_set_storage(&submit->batch,
-                         (struct anv_address) { .bo = submit->batch_bo, },
-                         submit->batch_bo->map, submit->batch_bo->size);
+   submit->batch = (struct anv_batch) {
+      .alloc = &device->vk.alloc,
+      .relocs = &submit->relocs,
+      .user_data = submit,
+      .extend_cb = anv_utrace_submit_extend_batch,
+   };
 
    if (frame) {
       if (begin)
@@ -588,8 +608,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
       }
    }
 
-   anv_batch_emit(&submit->batch, GFX8_MI_BATCH_BUFFER_END, bbs);
-   anv_batch_emit(&submit->batch, GFX8_MI_NOOP, noop);
+   anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
+   anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);
 
    if (submit->batch.status != VK_SUCCESS) {
       result = submit->batch.status;
@@ -606,8 +626,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg
 
  error_reloc_list:
    anv_reloc_list_finish(&submit->relocs);
- error_batch_bo:
-   anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo);
+   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+      anv_bo_pool_free(&device->utrace_bo_pool, *bo);
  error_sync:
    vk_sync_destroy(&device->vk, submit->sync);
  error_trace:
diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c
index 459b6c7f860..925c40e09ca 100644
--- a/src/intel/vulkan/i915/anv_batch_chain.c
+++ b/src/intel/vulkan/i915/anv_batch_chain.c
@@ -522,38 +522,43 @@ setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
    if (result != VK_SUCCESS)
       return result;
 
-   result = anv_execbuf_add_bo(device, execbuf,
-                               submit->batch_bo,
-                               &submit->relocs, 0);
-   if (result != VK_SUCCESS)
-      return result;
+   util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) {
+      struct anv_bo *bo = *_bo;
+
+      result = anv_execbuf_add_bo(device, execbuf, bo,
+                                  &submit->relocs, 0);
+      if (result != VK_SUCCESS)
+         return result;
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+      if (device->physical->memory.need_flush)
+         intel_flush_range(bo->map, bo->size);
+#endif
+   }
 
    result = anv_execbuf_add_sync(device, execbuf, submit->sync,
                                  true /* is_signal */, 0 /* value */);
    if (result != VK_SUCCESS)
       return result;
 
-   if (submit->batch_bo->exec_obj_index != execbuf->bo_count - 1) {
-      uint32_t idx = submit->batch_bo->exec_obj_index;
+   struct anv_bo *batch_bo =
+      *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
+   if (batch_bo->exec_obj_index != execbuf->bo_count - 1) {
+      uint32_t idx = batch_bo->exec_obj_index;
       uint32_t last_idx = execbuf->bo_count - 1;
 
       struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
-      assert(execbuf->bos[idx] == submit->batch_bo);
+      assert(execbuf->bos[idx] == batch_bo);
 
       execbuf->objects[idx] = execbuf->objects[last_idx];
       execbuf->bos[idx] = execbuf->bos[last_idx];
       execbuf->bos[idx]->exec_obj_index = idx;
 
       execbuf->objects[last_idx] = tmp_obj;
-      execbuf->bos[last_idx] = submit->batch_bo;
-      submit->batch_bo->exec_obj_index = last_idx;
+      execbuf->bos[last_idx] = batch_bo;
+      batch_bo->exec_obj_index = last_idx;
    }
 
-#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
-   if (device->physical->memory.need_flush)
-      intel_flush_range(submit->batch_bo->map, submit->batch_bo->size);
-#endif
-
    uint64_t exec_flags = 0;
    uint32_t context_id;
    get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
@@ -596,7 +601,8 @@ static VkResult
 anv_queue_exec_utrace_locked(struct anv_queue *queue,
                              struct anv_utrace_submit *submit)
 {
-   assert(submit->batch_bo);
+   assert(util_dynarray_num_elements(&submit->batch_bos,
+                                     struct anv_bo *) > 0);
 
    struct anv_device *device = queue->device;
    struct anv_execbuf execbuf = {
@@ -740,7 +746,9 @@ i915_queue_exec_locked(struct anv_queue *queue,
    };
    VkResult result;
 
-   if (utrace_submit && !utrace_submit->batch_bo) {
+   if (utrace_submit &&
+       util_dynarray_num_elements(&utrace_submit->batch_bos,
+                                  struct anv_bo *) == 0) {
       result = anv_execbuf_add_sync(device, &execbuf,
                                     utrace_submit->sync,
                                     true /* is_signal */,
@@ -950,7 +958,8 @@ VkResult
 i915_queue_exec_trace(struct anv_queue *queue,
                       struct anv_utrace_submit *submit)
 {
-   assert(submit->batch_bo);
+   assert(util_dynarray_num_elements(&submit->batch_bos,
+                                     struct anv_bo *) > 0);
 
    return anv_queue_exec_utrace_locked(queue, submit);
 }
diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c
index 28588aedfe2..baf04db3cb3 100644
--- a/src/intel/vulkan/xe/anv_batch_chain.c
+++ b/src/intel/vulkan/xe/anv_batch_chain.c
@@ -126,7 +126,9 @@ xe_exec_process_syncs(struct anv_queue *queue,
    /* Signal the utrace sync only if it doesn't have a batch. Otherwise the
     * it's the utrace batch that should signal its own sync.
     */
-   if (utrace_submit && !utrace_submit->batch_bo) {
+   if (utrace_submit &&
+       util_dynarray_num_elements(&utrace_submit->batch_bos,
+                                  struct anv_bo *) == 0) {
       struct drm_xe_sync *xe_sync = &xe_syncs[count++];
 
       xe_exec_fill_sync(xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL);
@@ -186,17 +188,20 @@ xe_queue_exec_utrace_locked(struct anv_queue *queue,
    xe_exec_fill_sync(&xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL);
 
 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
-   if (device->physical->memory.need_flush)
-      intel_flush_range(utrace_submit->batch_bo->map,
-                        utrace_submit->batch_bo->size);
+   if (device->physical->memory.need_flush) {
+      util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo)
+         intel_flush_range((*bo)->map, (*bo)->size);
+   }
 #endif
 
+   struct anv_bo *batch_bo =
+      *util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0);
    struct drm_xe_exec exec = {
       .exec_queue_id = queue->exec_queue_id,
       .num_batch_buffer = 1,
       .syncs = (uintptr_t)&xe_sync,
       .num_syncs = 1,
-      .address = utrace_submit->batch_bo->offset,
+      .address = batch_bo->offset,
    };
    if (likely(!device->info->no_hw)) {
       if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
@@ -283,7 +288,9 @@ xe_queue_exec_locked(struct anv_queue *queue,
       return result;
 
    /* If we have no batch for utrace, just forget about it now. */
-   if (utrace_submit && !utrace_submit->batch_bo)
+   if (utrace_submit &&
+       util_dynarray_num_elements(&utrace_submit->batch_bos,
+                                  struct anv_bo *) == 0)
       utrace_submit = NULL;
 
    struct drm_xe_exec exec = {

Reply via email to