Module: Mesa Branch: main Commit: c4fb82744158adbe8799370a1f35823b4b47a8a5 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=c4fb82744158adbe8799370a1f35823b4b47a8a5
Author: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> Date: Wed Oct 18 12:04:42 2023 +0200 radv: Add compute DGC preprocessing support. This should reduce the overhead due to reduced syncs. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25835> --- src/amd/vulkan/radv_cmd_buffer.c | 18 +++------ src/amd/vulkan/radv_device_generated_commands.c | 51 +++++++++++++++++++++++-- src/amd/vulkan/radv_private.h | 4 ++ 3 files changed, 56 insertions(+), 17 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index ccb46c15861..20908dae425 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -9519,18 +9519,6 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer); static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer); -static bool -radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) -{ - VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); - - /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when - * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's - * not possible to skip the execute DGC call (ie. no INDIRECT_PACKET) - */ - return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating; -} - VKAPI_ATTR void VKAPI_CALL radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) @@ -9557,7 +9545,11 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre cmd_buffer->state.predicating = true; } - radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo); + if (!layout->use_preprocess) { + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo); + + cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2; + } if (compute) { radv_dgc_before_dispatch(cmd_buffer); diff --git a/src/amd/vulkan/radv_device_generated_commands.c b/src/amd/vulkan/radv_device_generated_commands.c index 6078a9794db..5afd1c119a6 100644 --- a/src/amd/vulkan/radv_device_generated_commands.c +++ b/src/amd/vulkan/radv_device_generated_commands.c @@ -1337,6 +1337,18 @@ radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLa if (!layout->indexed) layout->binds_index_buffer = false; + layout->use_preprocess = pCreateInfo->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV; + + /* From the Vulkan spec (1.3.269, chapter 32): + * "The bound descriptor sets and push constants that will be used with indirect command generation for the compute + * piplines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV. + * They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV." + * + * So we can always preprocess compute layouts. + */ + if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE) + layout->use_preprocess = false; + *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout); return VK_SUCCESS; } @@ -1379,12 +1391,45 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device, align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment); } +bool +radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) +{ + VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); + + /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when + * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's + * not possible to skip the execute DGC call (ie. no INDIRECT_PACKET) + */ + return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating; +} + VKAPI_ATTR void VKAPI_CALL radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo) { - /* Can't do anything here as we depend on some dynamic state in some cases that we only know - * at draw time. */ + VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); + VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout); + + if (!layout->use_preprocess) + return; + + const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo); + + if (use_predication) { + VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer); + const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset + + pGeneratedCommandsInfo->sequencesCountOffset; + + radv_begin_conditional_rendering(cmd_buffer, va, true); + cmd_buffer->state.predicating = true; + } + + radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo); + + if (use_predication) { + cmd_buffer->state.predicating = false; + radv_end_conditional_rendering(cmd_buffer); + } } /* Always need to call this directly before draw due to dependence on bound state. */ @@ -1658,8 +1703,6 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsIn radv_buffer_finish(&token_buffer); radv_meta_restore(&saved_state, cmd_buffer); - - cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2; } /* VK_NV_device_generated_commands_compute */ diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index c87cd06bebb..ccfb1549401 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -3257,11 +3257,15 @@ struct radv_indirect_command_layout { uint32_t ibo_type_32; uint32_t ibo_type_8; + bool use_preprocess; + VkIndirectCommandsLayoutTokenNV tokens[0]; }; uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info); +bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, + const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo); void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo); static inline uint32_t