Module: Mesa
Branch: main
Commit: c4fb82744158adbe8799370a1f35823b4b47a8a5
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c4fb82744158adbe8799370a1f35823b4b47a8a5

Author: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl>
Date:   Wed Oct 18 12:04:42 2023 +0200

radv: Add compute DGC preprocessing support.

This should reduce overhead, because fewer syncs are needed.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25835>

---

 src/amd/vulkan/radv_cmd_buffer.c                | 18 +++------
 src/amd/vulkan/radv_device_generated_commands.c | 51 +++++++++++++++++++++++--
 src/amd/vulkan/radv_private.h                   |  4 ++
 3 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ccb46c15861..20908dae425 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -9519,18 +9519,6 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer 
commandBuffer, VkBuffer _b
 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
 
-static bool
-radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const 
VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
-{
-   VK_FROM_HANDLE(radv_buffer, seq_count_buffer, 
pGeneratedCommandsInfo->sequencesCountBuffer);
-
-   /* Enable conditional rendering (if not enabled by user) to skip 
prepare/execute DGC calls when
-    * the indirect sequence count might be zero. This can only be enabled on 
GFX because on ACE it's
-    * not possible to skip the execute DGC call (ie. no INDIRECT_PACKET)
-    */
-   return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && 
!cmd_buffer->state.predicating;
-}
-
 VKAPI_ATTR void VKAPI_CALL
 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 
isPreprocessed,
                                    const VkGeneratedCommandsInfoNV 
*pGeneratedCommandsInfo)
@@ -9557,7 +9545,11 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer 
commandBuffer, VkBool32 isPre
       cmd_buffer->state.predicating = true;
    }
 
-   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
+   if (!layout->use_preprocess) {
+      radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
+
+      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | 
RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
+   }
 
    if (compute) {
       radv_dgc_before_dispatch(cmd_buffer);
diff --git a/src/amd/vulkan/radv_device_generated_commands.c 
b/src/amd/vulkan/radv_device_generated_commands.c
index 6078a9794db..5afd1c119a6 100644
--- a/src/amd/vulkan/radv_device_generated_commands.c
+++ b/src/amd/vulkan/radv_device_generated_commands.c
@@ -1337,6 +1337,18 @@ radv_CreateIndirectCommandsLayoutNV(VkDevice _device, 
const VkIndirectCommandsLa
    if (!layout->indexed)
       layout->binds_index_buffer = false;
 
+   layout->use_preprocess = pCreateInfo->flags & 
VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV;
+
+   /* From the Vulkan spec (1.3.269, chapter 32):
+    * "The bound descriptor sets and push constants that will be used with indirect command generation for the compute
+    * pipelines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV.
+    * They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV."
+    *
+    * So we can always preprocess compute layouts.
+    */
+   if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE)
+      layout->use_preprocess = false;
+
    *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
    return VK_SUCCESS;
 }
@@ -1379,12 +1391,45 @@ radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice 
_device,
       align(cmd_buf_size + upload_buf_size, 
pMemoryRequirements->memoryRequirements.alignment);
 }
 
+bool
+radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const 
VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+{
+   VK_FROM_HANDLE(radv_buffer, seq_count_buffer, 
pGeneratedCommandsInfo->sequencesCountBuffer);
+
+   /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
+    * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
+    * not possible to skip the execute DGC call (i.e. no INDIRECT_PACKET)
+    */
+   return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && 
!cmd_buffer->state.predicating;
+}
+
 VKAPI_ATTR void VKAPI_CALL
 radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
                                       const VkGeneratedCommandsInfoNV 
*pGeneratedCommandsInfo)
 {
-   /* Can't do anything here as we depend on some dynamic state in some cases 
that we only know
-    * at draw time. */
+   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   VK_FROM_HANDLE(radv_indirect_command_layout, layout, 
pGeneratedCommandsInfo->indirectCommandsLayout);
+
+   if (!layout->use_preprocess)
+      return;
+
+   const bool use_predication = radv_use_dgc_predication(cmd_buffer, 
pGeneratedCommandsInfo);
+
+   if (use_predication) {
+      VK_FROM_HANDLE(radv_buffer, seq_count_buffer, 
pGeneratedCommandsInfo->sequencesCountBuffer);
+      const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + 
seq_count_buffer->offset +
+                          pGeneratedCommandsInfo->sequencesCountOffset;
+
+      radv_begin_conditional_rendering(cmd_buffer, va, true);
+      cmd_buffer->state.predicating = true;
+   }
+
+   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
+
+   if (use_predication) {
+      cmd_buffer->state.predicating = false;
+      radv_end_conditional_rendering(cmd_buffer);
+   }
 }
 
 /* Always need to call this directly before draw due to dependence on bound 
state. */
@@ -1658,8 +1703,6 @@ radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, 
const VkGeneratedCommandsIn
 
    radv_buffer_finish(&token_buffer);
    radv_meta_restore(&saved_state, cmd_buffer);
-
-   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | 
RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
 }
 
 /* VK_NV_device_generated_commands_compute */
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index c87cd06bebb..ccfb1549401 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -3257,11 +3257,15 @@ struct radv_indirect_command_layout {
    uint32_t ibo_type_32;
    uint32_t ibo_type_8;
 
+   bool use_preprocess;
+
    VkIndirectCommandsLayoutTokenNV tokens[0];
 };
 
 uint32_t radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV 
*cmd_info);
 
+bool radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer,
+                              const VkGeneratedCommandsInfoNV 
*pGeneratedCommandsInfo);
 void radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const 
VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo);
 
 static inline uint32_t

Reply via email to