Module: Mesa Branch: main Commit: 2dc452ec7cab6e71a4d43949d420760b502a4049 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=2dc452ec7cab6e71a4d43949d420760b502a4049
Author: Lionel Landwerlin <lionel.g.landwer...@intel.com> Date: Thu Nov 2 22:50:38 2023 +0200 anv: dynamically allocate utrace batch buffers Estimating the batch space required can be tricky because of all the workarounds. So implement chaining of batches like we do for command buffers. Signed-off-by: Lionel Landwerlin <lionel.g.landwer...@intel.com> Reviewed-by: José Roberto de Souza <jose.so...@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26087> --- src/intel/vulkan/anv_private.h | 2 +- src/intel/vulkan/anv_utrace.c | 110 +++++++++++++++++++------------- src/intel/vulkan/i915/anv_batch_chain.c | 45 +++++++------ src/intel/vulkan/xe/anv_batch_chain.c | 19 ++++-- 4 files changed, 106 insertions(+), 70 deletions(-) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 36c078e23a7..d964c3bb05b 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -5509,7 +5509,7 @@ struct anv_utrace_submit { */ struct anv_reloc_list relocs; struct anv_batch batch; - struct anv_bo *batch_bo; + struct util_dynarray batch_bos; /* Stream for temporary allocations */ struct anv_state_stream dynamic_state_stream; diff --git a/src/intel/vulkan/anv_utrace.c b/src/intel/vulkan/anv_utrace.c index 68c519587b1..93759e58b73 100644 --- a/src/intel/vulkan/anv_utrace.c +++ b/src/intel/vulkan/anv_utrace.c @@ -25,7 +25,7 @@ #include "anv_internal_kernels.h" #include "ds/intel_tracepoints.h" -#include "genxml/gen8_pack.h" +#include "genxml/gen9_pack.h" #include "perf/intel_perf.h" #include "util/perf/cpu_trace.h" @@ -88,10 +88,9 @@ anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data) if (submit->trace_bo) anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo); - if (submit->batch_bo) { - anv_reloc_list_finish(&submit->relocs); - anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); - } + util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo) + anv_bo_pool_free(&device->utrace_bo_pool, *bo); + util_dynarray_fini(&submit->batch_bos); vk_sync_destroy(&device->vk, submit->sync); @@ -151,6 +150,44 @@ anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx, push_data_state); } +static VkResult +anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size, + void *user_data) +{ + struct anv_utrace_submit *submit = user_data; + + uint32_t alloc_size = 0; + util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo) + alloc_size += (*bo)->size; + alloc_size = MAX2(alloc_size * 2, 8192); + + struct anv_bo *bo; + VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool, + align(alloc_size, 4096), + &bo); + if (result != VK_SUCCESS) + return result; + + util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo); + + batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length; + + anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) { + bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length - + GFX9_MI_BATCH_BUFFER_START_length_bias; + bbs.SecondLevelBatchBuffer = Firstlevelbatch; + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = (struct anv_address) { bo, 0 }; + } + + anv_batch_set_storage(batch, + (struct anv_address) { .bo = bo, }, + bo->map, + bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length); + + return VK_SUCCESS; +} + VkResult anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, uint32_t cmd_buffer_count, @@ -175,6 +212,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, if (!submit) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + submit->queue = queue; + intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id); result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type, @@ -182,6 +221,8 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, if (result != VK_SUCCESS) goto error_sync; + util_dynarray_init(&submit->batch_bos, NULL); + if (utrace_copies > 0) { result = anv_bo_pool_alloc(&device->utrace_bo_pool, utrace_copies * 4096, @@ -189,22 +230,6 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, if (result != VK_SUCCESS) goto error_trace_buf; - uint32_t batch_size = 512; /* 128 dwords of setup */ - if (intel_needs_workaround(device->info, 16013994831)) { - /* Enable/Disable preemption at the begin/end */ - batch_size += 2 * (250 /* 250 MI_NOOPs*/ + - 6 /* PIPE_CONTROL */ + - 3 /* MI_LRI */) * 4 /* dwords */; - } - batch_size += 256 * utrace_copies; /* 64 dwords per copy */ - batch_size = align(batch_size + 4, 8); /* MI_BATCH_BUFFER_END */ - - result = anv_bo_pool_alloc(&device->utrace_bo_pool, - align(batch_size, 4096), - &submit->batch_bo); - if (result != VK_SUCCESS) - goto error_batch_buf; - const bool uses_relocs = device->physical->uses_relocs; result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs); if (result != VK_SUCCESS) @@ -215,11 +240,12 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, anv_state_stream_init(&submit->general_state_stream, &device->general_state_pool, 16384); - submit->batch.alloc = &device->vk.alloc; - submit->batch.relocs = &submit->relocs; - anv_batch_set_storage(&submit->batch, - (struct anv_address) { .bo = submit->batch_bo, }, - submit->batch_bo->map, submit->batch_bo->size); + submit->batch = (struct anv_batch) { + .alloc = &device->vk.alloc, + .relocs = &submit->relocs, + .user_data = submit, + .extend_cb = anv_utrace_submit_extend_batch, + }; /* Only engine class where we support timestamp copies * @@ -304,17 +330,15 @@ anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue, } } - submit->queue = queue; - *out_submit = submit; return VK_SUCCESS; error_batch: anv_reloc_list_finish(&submit->relocs); + util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo) + anv_bo_pool_free(&device->utrace_bo_pool, *bo); error_reloc_list: - anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); - error_batch_buf: anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo); error_trace_buf: vk_sync_destroy(&device->vk, submit->sync); @@ -555,21 +579,17 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg if (result != VK_SUCCESS) goto error_trace; - result = anv_bo_pool_alloc(&device->utrace_bo_pool, 4096, - &submit->batch_bo); - if (result != VK_SUCCESS) - goto error_sync; - const bool uses_relocs = device->physical->uses_relocs; result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs); if (result != VK_SUCCESS) - goto error_batch_bo; + goto error_sync; - submit->batch.alloc = &device->vk.alloc; - submit->batch.relocs = &submit->relocs; - anv_batch_set_storage(&submit->batch, - (struct anv_address) { .bo = submit->batch_bo, }, - submit->batch_bo->map, submit->batch_bo->size); + submit->batch = (struct anv_batch) { + .alloc = &device->vk.alloc, + .relocs = &submit->relocs, + .user_data = submit, + .extend_cb = anv_utrace_submit_extend_batch, + }; if (frame) { if (begin) @@ -588,8 +608,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg } } - anv_batch_emit(&submit->batch, GFX8_MI_BATCH_BUFFER_END, bbs); - anv_batch_emit(&submit->batch, GFX8_MI_NOOP, noop); + anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs); + anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop); if (submit->batch.status != VK_SUCCESS) { result = submit->batch.status; @@ -606,8 +626,8 @@ anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool beg error_reloc_list: anv_reloc_list_finish(&submit->relocs); - error_batch_bo: - anv_bo_pool_free(&device->utrace_bo_pool, submit->batch_bo); + util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo) + anv_bo_pool_free(&device->utrace_bo_pool, *bo); error_sync: vk_sync_destroy(&device->vk, submit->sync); error_trace: diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c index 459b6c7f860..925c40e09ca 100644 --- a/src/intel/vulkan/i915/anv_batch_chain.c +++ b/src/intel/vulkan/i915/anv_batch_chain.c @@ -522,38 +522,43 @@ setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue, if (result != VK_SUCCESS) return result; - result = anv_execbuf_add_bo(device, execbuf, - submit->batch_bo, - &submit->relocs, 0); - if (result != VK_SUCCESS) - return result; + util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) { + struct anv_bo *bo = *_bo; + + result = anv_execbuf_add_bo(device, execbuf, bo, + &submit->relocs, 0); + if (result != VK_SUCCESS) + return result; + +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_flush) + intel_flush_range(bo->map, bo->size); +#endif + } result = anv_execbuf_add_sync(device, execbuf, submit->sync, true /* is_signal */, 0 /* value */); if (result != VK_SUCCESS) return result; - if (submit->batch_bo->exec_obj_index != execbuf->bo_count - 1) { - uint32_t idx = submit->batch_bo->exec_obj_index; + struct anv_bo *batch_bo = + *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0); + if (batch_bo->exec_obj_index != execbuf->bo_count - 1) { + uint32_t idx = batch_bo->exec_obj_index; uint32_t last_idx = execbuf->bo_count - 1; struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx]; - assert(execbuf->bos[idx] == submit->batch_bo); + assert(execbuf->bos[idx] == batch_bo); execbuf->objects[idx] = execbuf->objects[last_idx]; execbuf->bos[idx] = execbuf->bos[last_idx]; execbuf->bos[idx]->exec_obj_index = idx; execbuf->objects[last_idx] = tmp_obj; - execbuf->bos[last_idx] = submit->batch_bo; - submit->batch_bo->exec_obj_index = last_idx; + execbuf->bos[last_idx] = batch_bo; + batch_bo->exec_obj_index = last_idx; } -#ifdef SUPPORT_INTEL_INTEGRATED_GPUS - if (device->physical->memory.need_flush) - intel_flush_range(submit->batch_bo->map, submit->batch_bo->size); -#endif - uint64_t exec_flags = 0; uint32_t context_id; get_context_and_exec_flags(queue, false, &exec_flags, &context_id); @@ -596,7 +601,8 @@ static VkResult anv_queue_exec_utrace_locked(struct anv_queue *queue, struct anv_utrace_submit *submit) { - assert(submit->batch_bo); + assert(util_dynarray_num_elements(&submit->batch_bos, + struct anv_bo *) > 0); struct anv_device *device = queue->device; struct anv_execbuf execbuf = { @@ -740,7 +746,9 @@ i915_queue_exec_locked(struct anv_queue *queue, }; VkResult result; - if (utrace_submit && !utrace_submit->batch_bo) { + if (utrace_submit && + util_dynarray_num_elements(&utrace_submit->batch_bos, + struct anv_bo *) == 0) { result = anv_execbuf_add_sync(device, &execbuf, utrace_submit->sync, true /* is_signal */, @@ -950,7 +958,8 @@ VkResult i915_queue_exec_trace(struct anv_queue *queue, struct anv_utrace_submit *submit) { - assert(submit->batch_bo); + assert(util_dynarray_num_elements(&submit->batch_bos, + struct anv_bo *) > 0); return anv_queue_exec_utrace_locked(queue, submit); } diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c index 28588aedfe2..baf04db3cb3 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.c +++ b/src/intel/vulkan/xe/anv_batch_chain.c @@ -126,7 +126,9 @@ xe_exec_process_syncs(struct anv_queue *queue, /* Signal the utrace sync only if it doesn't have a batch. Otherwise the * it's the utrace batch that should signal its own sync. */ - if (utrace_submit && !utrace_submit->batch_bo) { + if (utrace_submit && + util_dynarray_num_elements(&utrace_submit->batch_bos, + struct anv_bo *) == 0) { struct drm_xe_sync *xe_sync = &xe_syncs[count++]; xe_exec_fill_sync(xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL); @@ -186,17 +188,20 @@ xe_queue_exec_utrace_locked(struct anv_queue *queue, xe_exec_fill_sync(&xe_sync, utrace_submit->sync, 0, TYPE_SIGNAL); #ifdef SUPPORT_INTEL_INTEGRATED_GPUS - if (device->physical->memory.need_flush) - intel_flush_range(utrace_submit->batch_bo->map, - utrace_submit->batch_bo->size); + if (device->physical->memory.need_flush) { + util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo) + intel_flush_range((*bo)->map, (*bo)->size); + } #endif + struct anv_bo *batch_bo = + *util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0); struct drm_xe_exec exec = { .exec_queue_id = queue->exec_queue_id, .num_batch_buffer = 1, .syncs = (uintptr_t)&xe_sync, .num_syncs = 1, - .address = utrace_submit->batch_bo->offset, + .address = batch_bo->offset, }; if (likely(!device->info->no_hw)) { if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) @@ -283,7 +288,9 @@ xe_queue_exec_locked(struct anv_queue *queue, return result; /* If we have no batch for utrace, just forget about it now. */ - if (utrace_submit && !utrace_submit->batch_bo) + if (utrace_submit && + util_dynarray_num_elements(&utrace_submit->batch_bos, + struct anv_bo *) == 0) utrace_submit = NULL; struct drm_xe_exec exec = {