Module: Mesa Branch: main Commit: 88cbe32048aff3d7a873474bebc2d1068c21ce46 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=88cbe32048aff3d7a873474bebc2d1068c21ce46
Author: Samuel Pitoiset <samuel.pitoi...@gmail.com> Date: Mon May 1 14:57:45 2023 +0200 radv: add support for RGP queue events This can be used for analysing queue submissions. This is still experimental and it can be disabled with RADV_THREAD_TRACE_QUEUE_EVENTS=false if you have issues with it. Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22779> --- docs/envvars.rst | 4 + src/amd/vulkan/layers/radv_sqtt_layer.c | 259 ++++++++++++++++++++++++++++++++ src/amd/vulkan/radv_device.c | 5 +- src/amd/vulkan/radv_private.h | 25 +++ src/amd/vulkan/radv_sqtt.c | 222 +++++++++++++++++++++++++++ 5 files changed, 513 insertions(+), 2 deletions(-) diff --git a/docs/envvars.rst b/docs/envvars.rst index b46b070a13c..2178cab6eb1 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -1362,6 +1362,10 @@ RADV driver environment variables enable/disable SQTT/RGP instruction timing (enabled by default) +.. envvar:: RADV_THREAD_TRACE_QUEUE_EVENTS + + enable/disable SQTT/RGP queue events (enabled by default) + .. envvar:: RADV_RRA_TRACE_VALIDATE enable validation of captured acceleration structures. 
Can be diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c index ab11943cd8f..5f250490ced 100644 --- a/src/amd/vulkan/layers/radv_sqtt_layer.c +++ b/src/amd/vulkan/layers/radv_sqtt_layer.c @@ -25,6 +25,7 @@ #include "radv_private.h" #include "radv_shader.h" #include "vk_common_entrypoints.h" +#include "vk_semaphore.h" #include "wsi_common_entrypoints.h" #include "ac_rgp.h" @@ -535,6 +536,83 @@ radv_describe_pipeline_bind(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPo radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4); } +/* Queue events */ +static void +radv_describe_queue_event(struct radv_queue *queue, struct rgp_queue_event_record *record) +{ + struct radv_device *device = queue->device; + struct ac_sqtt *sqtt = &device->sqtt; + struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event; + + simple_mtx_lock(&queue_event->lock); + list_addtail(&record->list, &queue_event->record); + queue_event->record_count++; + simple_mtx_unlock(&queue_event->lock); +} + +static VkResult +radv_describe_queue_present(struct radv_queue *queue, uint64_t cpu_timestamp, void *gpu_timestamp_ptr) +{ + struct rgp_queue_event_record *record; + + record = calloc(1, sizeof(struct rgp_queue_event_record)); + if (!record) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + record->event_type = SQTT_QUEUE_TIMING_EVENT_PRESENT; + record->cpu_timestamp = cpu_timestamp; + record->gpu_timestamps[0] = gpu_timestamp_ptr; + record->queue_info_index = queue->vk.queue_family_index; + + radv_describe_queue_event(queue, record); + + return VK_SUCCESS; +} + +static VkResult +radv_describe_queue_submit(struct radv_queue *queue, struct radv_cmd_buffer *cmd_buffer, uint64_t cpu_timestamp, + void *pre_gpu_timestamp_ptr, void *post_gpu_timestamp_ptr) +{ + struct radv_device *device = queue->device; + struct rgp_queue_event_record *record; + + record = calloc(1, sizeof(struct rgp_queue_event_record)); + if (!record) + return VK_ERROR_OUT_OF_HOST_MEMORY; + 
+ record->event_type = SQTT_QUEUE_TIMING_EVENT_CMDBUF_SUBMIT; + record->api_id = (uintptr_t)cmd_buffer; + record->cpu_timestamp = cpu_timestamp; + record->frame_index = device->vk.current_frame; + record->gpu_timestamps[0] = pre_gpu_timestamp_ptr; + record->gpu_timestamps[1] = post_gpu_timestamp_ptr; + record->queue_info_index = queue->vk.queue_family_index; + + radv_describe_queue_event(queue, record); + + return VK_SUCCESS; +} + +static VkResult +radv_describe_queue_semaphore(struct radv_queue *queue, struct vk_semaphore *sync, + enum sqtt_queue_event_type event_type) +{ + struct rgp_queue_event_record *record; + + record = calloc(1, sizeof(struct rgp_queue_event_record)); + if (!record) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + record->event_type = event_type; + record->api_id = (uintptr_t)sync; + record->cpu_timestamp = os_time_get_nano(); + record->queue_info_index = queue->vk.queue_family_index; + + radv_describe_queue_event(queue, record); + + return VK_SUCCESS; +} + static void radv_handle_sqtt(VkQueue _queue) { @@ -597,15 +675,196 @@ sqtt_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo) RADV_FROM_HANDLE(radv_queue, queue, _queue); VkResult result; + queue->sqtt_present = true; + result = queue->device->layer_dispatch.rgp.QueuePresentKHR(_queue, pPresentInfo); if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) return result; + queue->sqtt_present = false; + radv_handle_sqtt(_queue); return VK_SUCCESS; } +static VkResult +radv_sqtt_wsi_submit(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence _fence) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + struct radv_device *device = queue->device; + VkCommandBufferSubmitInfo *new_cmdbufs = NULL; + struct radeon_winsys_bo *gpu_timestamp_bo; + uint32_t gpu_timestamp_offset; + VkCommandBuffer timed_cmdbuf; + void *gpu_timestamp_ptr; + uint64_t cpu_timestamp; + VkResult result = VK_SUCCESS; + + assert(submitCount <= 1 && pSubmits != NULL); + + for (uint32_t i = 
0; i < submitCount; i++) { + const VkSubmitInfo2 *pSubmit = &pSubmits[i]; + VkSubmitInfo2 sqtt_submit = *pSubmit; + + assert(sqtt_submit.commandBufferInfoCount <= 1); + + /* Command buffers */ + uint32_t new_cmdbuf_count = sqtt_submit.commandBufferInfoCount + 1; + + new_cmdbufs = malloc(new_cmdbuf_count * sizeof(*new_cmdbufs)); + if (!new_cmdbufs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + /* Sample the current CPU time before building the GPU timestamp cmdbuf. */ + cpu_timestamp = os_time_get_nano(); + + result = radv_sqtt_acquire_gpu_timestamp(device, &gpu_timestamp_bo, &gpu_timestamp_offset, &gpu_timestamp_ptr); + if (result != VK_SUCCESS) + goto fail; + + result = radv_sqtt_get_timed_cmdbuf(queue, gpu_timestamp_bo, gpu_timestamp_offset, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, &timed_cmdbuf); + if (result != VK_SUCCESS) + goto fail; + + new_cmdbufs[0] = (VkCommandBufferSubmitInfo){ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = timed_cmdbuf, + }; + + if (sqtt_submit.commandBufferInfoCount == 1) + new_cmdbufs[1] = sqtt_submit.pCommandBufferInfos[0]; + + sqtt_submit.commandBufferInfoCount = new_cmdbuf_count; + sqtt_submit.pCommandBufferInfos = new_cmdbufs; + + radv_describe_queue_present(queue, cpu_timestamp, gpu_timestamp_ptr); + + result = queue->device->layer_dispatch.rgp.QueueSubmit2KHR(_queue, 1, &sqtt_submit, _fence); + if (result != VK_SUCCESS) + goto fail; + + FREE(new_cmdbufs); + } + + return result; + +fail: + FREE(new_cmdbufs); + return result; +} + +VKAPI_ATTR VkResult VKAPI_CALL +sqtt_QueueSubmit2KHR(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSubmits, VkFence _fence) +{ + RADV_FROM_HANDLE(radv_queue, queue, _queue); + const bool is_gfx_or_ace = queue->state.qf == RADV_QUEUE_GENERAL || queue->state.qf == RADV_QUEUE_COMPUTE; + struct radv_device *device = queue->device; + VkCommandBufferSubmitInfo *new_cmdbufs = NULL; + VkResult result = VK_SUCCESS; + + /* Only consider queue events on graphics/compute when 
enabled. */ + if (!device->sqtt_enabled || !radv_sqtt_queue_events_enabled() || !is_gfx_or_ace) + return queue->device->layer_dispatch.rgp.QueueSubmit2KHR(_queue, submitCount, pSubmits, _fence); + + for (uint32_t i = 0; i < submitCount; i++) { + const VkSubmitInfo2 *pSubmit = &pSubmits[i]; + + /* Wait semaphores */ + for (uint32_t j = 0; j < pSubmit->waitSemaphoreInfoCount; j++) { + const VkSemaphoreSubmitInfo *pWaitSemaphoreInfo = &pSubmit->pWaitSemaphoreInfos[j]; + VK_FROM_HANDLE(vk_semaphore, sem, pWaitSemaphoreInfo->semaphore); + radv_describe_queue_semaphore(queue, sem, SQTT_QUEUE_TIMING_EVENT_WAIT_SEMAPHORE); + } + } + + if (queue->sqtt_present) + return radv_sqtt_wsi_submit(_queue, submitCount, pSubmits, _fence); + + for (uint32_t i = 0; i < submitCount; i++) { + const VkSubmitInfo2 *pSubmit = &pSubmits[i]; + VkSubmitInfo2 sqtt_submit = *pSubmit; + + /* Command buffers */ + uint32_t new_cmdbuf_count = sqtt_submit.commandBufferInfoCount * 3; + uint32_t cmdbuf_idx = 0; + + new_cmdbufs = malloc(new_cmdbuf_count * sizeof(*new_cmdbufs)); + if (!new_cmdbufs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + for (uint32_t j = 0; j < sqtt_submit.commandBufferInfoCount; j++) { + const VkCommandBufferSubmitInfo *pCommandBufferInfo = &sqtt_submit.pCommandBufferInfos[j]; + struct radeon_winsys_bo *gpu_timestamps_bo[2]; + uint32_t gpu_timestamps_offset[2]; + VkCommandBuffer pre_timed_cmdbuf, post_timed_cmdbuf; + void *gpu_timestamps_ptr[2]; + uint64_t cpu_timestamp; + + /* Sample the current CPU time before building the timed cmdbufs. 
*/ + cpu_timestamp = os_time_get_nano(); + + result = radv_sqtt_acquire_gpu_timestamp(queue->device, &gpu_timestamps_bo[0], &gpu_timestamps_offset[0], + &gpu_timestamps_ptr[0]); + if (result != VK_SUCCESS) + goto fail; + + result = radv_sqtt_get_timed_cmdbuf(queue, gpu_timestamps_bo[0], gpu_timestamps_offset[0], + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, &pre_timed_cmdbuf); + if (result != VK_SUCCESS) + goto fail; + + new_cmdbufs[cmdbuf_idx++] = (VkCommandBufferSubmitInfo){ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = pre_timed_cmdbuf, + }; + + new_cmdbufs[cmdbuf_idx++] = *pCommandBufferInfo; + + result = radv_sqtt_acquire_gpu_timestamp(queue->device, &gpu_timestamps_bo[1], &gpu_timestamps_offset[1], + &gpu_timestamps_ptr[1]); + if (result != VK_SUCCESS) + goto fail; + + result = radv_sqtt_get_timed_cmdbuf(queue, gpu_timestamps_bo[1], gpu_timestamps_offset[1], + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, &post_timed_cmdbuf); + if (result != VK_SUCCESS) + goto fail; + + new_cmdbufs[cmdbuf_idx++] = (VkCommandBufferSubmitInfo){ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = post_timed_cmdbuf, + }; + + RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBufferInfo->commandBuffer); + radv_describe_queue_submit(queue, cmd_buffer, cpu_timestamp, gpu_timestamps_ptr[0], gpu_timestamps_ptr[1]); + } + + sqtt_submit.commandBufferInfoCount = new_cmdbuf_count; + sqtt_submit.pCommandBufferInfos = new_cmdbufs; + + result = queue->device->layer_dispatch.rgp.QueueSubmit2KHR(_queue, 1, &sqtt_submit, _fence); + if (result != VK_SUCCESS) + goto fail; + + /* Signal semaphores */ + for (uint32_t j = 0; j < sqtt_submit.signalSemaphoreInfoCount; j++) { + const VkSemaphoreSubmitInfo *pSignalSemaphoreInfo = &sqtt_submit.pSignalSemaphoreInfos[j]; + VK_FROM_HANDLE(vk_semaphore, sem, pSignalSemaphoreInfo->semaphore); + radv_describe_queue_semaphore(queue, sem, SQTT_QUEUE_TIMING_EVENT_SIGNAL_SEMAPHORE); + } + + FREE(new_cmdbufs); + } + + 
return result; + +fail: + FREE(new_cmdbufs); + return result; +} + #define EVENT_MARKER_BASE(cmd_name, api_name, event_name, ...) \ RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); \ radv_write_begin_general_api_marker(cmd_buffer, ApiCmd##api_name); \ diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index c6ada5e8354..f613bf105e7 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1014,9 +1014,10 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr fprintf(stderr, "radv: Thread trace support is enabled (initial buffer size: %u MiB, " - "instruction timing: %s, cache counters: %s).\n", + "instruction timing: %s, cache counters: %s, queue events: %s).\n", device->sqtt.buffer_size / (1024 * 1024), radv_is_instruction_timing_enabled() ? "enabled" : "disabled", - radv_spm_trace_enabled(device->instance) ? "enabled" : "disabled"); + radv_spm_trace_enabled(device->instance) ? "enabled" : "disabled", + radv_sqtt_queue_events_enabled() ? "enabled" : "disabled"); if (radv_spm_trace_enabled(device->instance)) { if (device->physical_device->rad_info.gfx_level >= GFX10) { diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index ff16f49da65..26412019a0c 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -840,6 +840,7 @@ struct radv_queue { struct radeon_winsys_bo *gang_sem_bo; uint64_t last_shader_upload_seq; + bool sqtt_present; }; int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int idx, @@ -946,6 +947,14 @@ enum radv_buffer_robustness { RADV_BUFFER_ROBUSTNESS_2, /* robustBufferAccess2 */ }; +struct radv_sqtt_timestamp { + uint8_t *map; + unsigned offset; + uint64_t size; + struct radeon_winsys_bo *bo; + struct list_head list; +}; + struct radv_device { struct vk_device vk; @@ -1049,6 +1058,14 @@ struct radv_device { bool sqtt_enabled; bool sqtt_triggered; + /* SQTT timestamps for queue events. 
*/ + simple_mtx_t sqtt_timestamp_mtx; + struct radv_sqtt_timestamp sqtt_timestamp; + + /* SQTT timed cmd buffers. */ + simple_mtx_t sqtt_command_pool_mtx; + struct vk_command_pool *sqtt_command_pool[2]; + /* Memory trace. */ struct radv_memory_trace_data memory_trace; @@ -3096,11 +3113,19 @@ bool radv_get_sqtt_trace(struct radv_queue *queue, struct ac_sqtt_trace *sqtt_tr void radv_reset_sqtt_trace(struct radv_device *device); void radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *data, uint32_t num_dwords); bool radv_is_instruction_timing_enabled(void); +bool radv_sqtt_queue_events_enabled(void); bool radv_sqtt_sample_clocks(struct radv_device *device); void radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit); void radv_emit_spi_config_cntl(const struct radv_device *device, struct radeon_cmdbuf *cs, bool enable); +VkResult radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, + uint32_t timestamp_offset, VkPipelineStageFlags2 timestamp_stage, + VkCommandBuffer *pcmdbuf); + +VkResult radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo, + uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr); + void radv_rra_trace_init(struct radv_device *device); VkResult radv_rra_dump_trace(VkQueue vk_queue, char *filename); diff --git a/src/amd/vulkan/radv_sqtt.c b/src/amd/vulkan/radv_sqtt.c index 6c3fa93861a..45c360c3e4c 100644 --- a/src/amd/vulkan/radv_sqtt.c +++ b/src/amd/vulkan/radv_sqtt.c @@ -24,9 +24,12 @@ #include <inttypes.h> #include "radv_cs.h" +#include "radv_debug.h" #include "radv_private.h" #include "sid.h" +#include "vk_common_entrypoints.h" + #define SQTT_BUFFER_ALIGN_SHIFT 12 bool @@ -35,6 +38,12 @@ radv_is_instruction_timing_enabled(void) return debug_get_bool_option("RADV_THREAD_TRACE_INSTRUCTION_TIMING", true); } +bool +radv_sqtt_queue_events_enabled(void) +{ + return 
debug_get_bool_option("RADV_THREAD_TRACE_QUEUE_EVENTS", true); +} + static uint32_t gfx11_get_sqtt_ctrl(const struct radv_device *device, bool enable) { @@ -479,6 +488,141 @@ radv_emit_inhibit_clockgating(const struct radv_device *device, struct radeon_cm } } +VkResult +radv_sqtt_acquire_gpu_timestamp(struct radv_device *device, struct radeon_winsys_bo **gpu_timestamp_bo, + uint32_t *gpu_timestamp_offset, void **gpu_timestamp_ptr) +{ + struct radeon_winsys *ws = device->ws; + + simple_mtx_lock(&device->sqtt_timestamp_mtx); + + if (device->sqtt_timestamp.offset + 8 > device->sqtt_timestamp.size) { + struct radeon_winsys_bo *bo; + uint64_t new_size; + VkResult result; + uint8_t *map; + + new_size = MAX2(4096, 2 * device->sqtt_timestamp.size); + + result = ws->buffer_create(ws, new_size, 8, RADEON_DOMAIN_GTT, + RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_SCRATCH, + 0, &bo); + if (result != VK_SUCCESS) { + simple_mtx_unlock(&device->sqtt_timestamp_mtx); + return result; + } + + map = device->ws->buffer_map(bo); + if (!map) { + ws->buffer_destroy(ws, bo); + simple_mtx_unlock(&device->sqtt_timestamp_mtx); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + if (device->sqtt_timestamp.bo) { + struct radv_sqtt_timestamp *new_timestamp; + + new_timestamp = malloc(sizeof(*new_timestamp)); + if (!new_timestamp) { + ws->buffer_destroy(ws, bo); + simple_mtx_unlock(&device->sqtt_timestamp_mtx); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + memcpy(new_timestamp, &device->sqtt_timestamp, sizeof(*new_timestamp)); + list_add(&new_timestamp->list, &device->sqtt_timestamp.list); + } + + device->sqtt_timestamp.bo = bo; + device->sqtt_timestamp.size = new_size; + device->sqtt_timestamp.offset = 0; + device->sqtt_timestamp.map = map; + } + + *gpu_timestamp_bo = device->sqtt_timestamp.bo; + *gpu_timestamp_offset = device->sqtt_timestamp.offset; + *gpu_timestamp_ptr = device->sqtt_timestamp.map + device->sqtt_timestamp.offset; + + device->sqtt_timestamp.offset 
+= 8; + + simple_mtx_unlock(&device->sqtt_timestamp_mtx); + + return VK_SUCCESS; +} + +static void +radv_sqtt_reset_timestamp(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + simple_mtx_lock(&device->sqtt_timestamp_mtx); + + list_for_each_entry_safe (struct radv_sqtt_timestamp, ts, &device->sqtt_timestamp.list, list) { + ws->buffer_destroy(ws, ts->bo); + list_del(&ts->list); + free(ts); + } + + device->sqtt_timestamp.offset = 0; + + simple_mtx_unlock(&device->sqtt_timestamp_mtx); +} + +static bool +radv_sqtt_init_queue_event(struct radv_device *device) +{ + VkCommandPool cmd_pool; + VkResult result; + + const VkCommandPoolCreateInfo create_gfx_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .queueFamilyIndex = RADV_QUEUE_GENERAL, /* Graphics queue is always the first queue. */ + }; + + result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_gfx_info, NULL, &cmd_pool); + if (result != VK_SUCCESS) + return false; + + device->sqtt_command_pool[0] = vk_command_pool_from_handle(cmd_pool); + + if (!(device->instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) { + const VkCommandPoolCreateInfo create_comp_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .queueFamilyIndex = RADV_QUEUE_COMPUTE, + }; + + result = vk_common_CreateCommandPool(radv_device_to_handle(device), &create_comp_info, NULL, &cmd_pool); + if (result != VK_SUCCESS) + return false; + + device->sqtt_command_pool[1] = vk_command_pool_from_handle(cmd_pool); + } + + simple_mtx_init(&device->sqtt_command_pool_mtx, mtx_plain); + + simple_mtx_init(&device->sqtt_timestamp_mtx, mtx_plain); + list_inithead(&device->sqtt_timestamp.list); + + return true; +} + +static void +radv_sqtt_finish_queue_event(struct radv_device *device) +{ + struct radeon_winsys *ws = device->ws; + + if (device->sqtt_timestamp.bo) + ws->buffer_destroy(ws, device->sqtt_timestamp.bo); + + simple_mtx_destroy(&device->sqtt_timestamp_mtx); + + for (unsigned i = 0; 
i < ARRAY_SIZE(device->sqtt_command_pool); i++) + vk_common_DestroyCommandPool(radv_device_to_handle(device), + vk_command_pool_to_handle(device->sqtt_command_pool[i]), NULL); + + simple_mtx_destroy(&device->sqtt_command_pool_mtx); +} + static bool radv_sqtt_init_bo(struct radv_device *device) { @@ -603,6 +747,9 @@ radv_sqtt_init(struct radv_device *device) if (!radv_sqtt_init_bo(device)) return false; + if (!radv_sqtt_init_queue_event(device)) + return false; + if (!radv_device_acquire_performance_counters(device)) return false; @@ -620,6 +767,7 @@ radv_sqtt_finish(struct radv_device *device) struct radeon_winsys *ws = device->ws; radv_sqtt_finish_bo(device); + radv_sqtt_finish_queue_event(device); for (unsigned i = 0; i < 2; i++) { if (device->sqtt.start_cs[i]) @@ -806,6 +954,7 @@ radv_reset_sqtt_trace(struct radv_device *device) { struct ac_sqtt *sqtt = &device->sqtt; struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration; + struct rgp_queue_event *queue_event = &sqtt->rgp_queue_event; /* Clear clock calibration records. */ simple_mtx_lock(&clock_calibration->lock); @@ -815,6 +964,26 @@ radv_reset_sqtt_trace(struct radv_device *device) free(record); } simple_mtx_unlock(&clock_calibration->lock); + + /* Clear queue event records. */ + simple_mtx_lock(&queue_event->lock); + list_for_each_entry_safe (struct rgp_queue_event_record, record, &queue_event->record, list) { + list_del(&record->list); + free(record); + } + queue_event->record_count = 0; + simple_mtx_unlock(&queue_event->lock); + + /* Clear timestamps. */ + radv_sqtt_reset_timestamp(device); + + /* Clear timed cmdbufs. 
*/ + simple_mtx_lock(&device->sqtt_command_pool_mtx); + for (unsigned i = 0; i < ARRAY_SIZE(device->sqtt_command_pool); i++) { + vk_common_TrimCommandPool(radv_device_to_handle(device), vk_command_pool_to_handle(device->sqtt_command_pool[i]), + 0); + } + simple_mtx_unlock(&device->sqtt_command_pool_mtx); } static VkResult @@ -856,3 +1025,56 @@ radv_sqtt_sample_clocks(struct radv_device *device) return ac_sqtt_add_clock_calibration(&device->sqtt, cpu_timestamp, gpu_timestamp); } + +VkResult +radv_sqtt_get_timed_cmdbuf(struct radv_queue *queue, struct radeon_winsys_bo *timestamp_bo, uint32_t timestamp_offset, + VkPipelineStageFlags2 timestamp_stage, VkCommandBuffer *pcmdbuf) +{ + struct radv_device *device = queue->device; + enum radv_queue_family queue_family = queue->state.qf; + VkCommandBuffer cmdbuf; + uint64_t timestamp_va; + VkResult result; + + assert(queue_family == RADV_QUEUE_GENERAL || queue_family == RADV_QUEUE_COMPUTE); + + simple_mtx_lock(&device->sqtt_command_pool_mtx); + + const VkCommandBufferAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = vk_command_pool_to_handle(device->sqtt_command_pool[queue_family]), + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = 1, + }; + + result = vk_common_AllocateCommandBuffers(radv_device_to_handle(device), &alloc_info, &cmdbuf); + if (result != VK_SUCCESS) + goto fail; + + const VkCommandBufferBeginInfo begin_info = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + result = radv_BeginCommandBuffer(cmdbuf, &begin_info); + if (result != VK_SUCCESS) + goto fail; + + radeon_check_space(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, 28); + + timestamp_va = radv_buffer_get_va(timestamp_bo) + timestamp_offset; + + radv_cs_add_buffer(device->ws, radv_cmd_buffer_from_handle(cmdbuf)->cs, timestamp_bo); + + radv_write_timestamp(radv_cmd_buffer_from_handle(cmdbuf), timestamp_va, 
timestamp_stage); + + result = radv_EndCommandBuffer(cmdbuf); + if (result != VK_SUCCESS) + goto fail; + + *pcmdbuf = cmdbuf; + +fail: + simple_mtx_unlock(&device->sqtt_command_pool_mtx); + return result; +}