Module: Mesa
Branch: main
Commit: 6c7265338d175224dd7899d326e33b769f3e743d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6c7265338d175224dd7899d326e33b769f3e743d

Author: Samuel Pitoiset <samuel.pitoi...@gmail.com>
Date:   Fri Oct 27 16:54:55 2023 +0200

radv: add support for task shader invocations queries on GFX10.3

Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25950>
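
Note on the mechanism: task shaders run on the gang-submitted ACE (compute)
queue, so the invocation counter is accumulated in GDS and copied into the
query buffer from the ACE command stream at both begin and end. Because the
gfx-side availability word can land before those ACE copies, each copy is
followed by a write of 0x80000000 into the upper dword of the 64-bit counter
slot, which doubles as a completion marker (the bit cancels out when end minus
start is computed). A minimal host-side sketch of that check, mirroring the
radv_GetQueryPoolResults hunk below (helper name and includes are
illustrative, not part of this change):

   #include <stdbool.h>
   #include <stdint.h>

   /* Sketch only: 'src' points at one query slot laid out as in radv_query.c.
    * The begin marker sits 4 bytes past the task invocations counter, the end
    * marker one pipelinestat block later. */
   static bool task_invoc_available(const uint8_t *src, uint32_t task_invoc_offset,
                                    uint32_t pipelinestat_block_size)
   {
      const uint32_t *start = (const uint32_t *)(src + task_invoc_offset + 4);
      const uint32_t *stop = (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);

      /* Both the begin and end ACE copies must have completed (bit 31 set). */
      return (*start & 0x80000000) && (*stop & 0x80000000);
   }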

---

 src/amd/vulkan/radv_cmd_buffer.c |  39 +++++++++++++--
 src/amd/vulkan/radv_private.h    |   2 +
 src/amd/vulkan/radv_query.c      | 104 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 140 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 5e80a1e8cf5..defa33f2fbb 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -4994,15 +4994,13 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
 }
 
 static void
-radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
+radv_flush_shader_query_state_gfx(struct radv_cmd_buffer *cmd_buffer)
 {
    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
    const struct radv_userdata_info *loc = radv_get_user_sgpr(last_vgt_shader, AC_UD_SHADER_QUERY_STATE);
    enum radv_shader_query_state shader_query_state = radv_shader_query_none;
    uint32_t base_reg;
 
-   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
-
    if (loc->sgpr_idx == -1)
       return;
 
@@ -5031,6 +5029,41 @@ radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
    radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
 }
 
+static void
+radv_flush_shader_query_state_ace(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *task_shader)
+{
+   const struct radv_userdata_info *loc = radv_get_user_sgpr(task_shader, AC_UD_SHADER_QUERY_STATE);
+   enum radv_shader_query_state shader_query_state = radv_shader_query_none;
+   uint32_t base_reg;
+
+   if (loc->sgpr_idx == -1)
+      return;
+
+   /* By default shader queries are disabled but they are enabled if the command buffer has active ACE
+    * queries or if it's a secondary command buffer that inherits the number of task shader
+    * invocations query.
+    */
+   if (cmd_buffer->state.active_pipeline_ace_queries ||
+       (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT))
+      shader_query_state |= radv_shader_query_pipeline_stat;
+
+   base_reg = task_shader->info.user_data_0;
+   assert(loc->sgpr_idx != -1);
+
+   radeon_set_sh_reg(cmd_buffer->gang.cs, base_reg + loc->sgpr_idx * 4, shader_query_state);
+}
+
+static void
+radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
+{
+   radv_flush_shader_query_state_gfx(cmd_buffer);
+
+   if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK))
+      radv_flush_shader_query_state_ace(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
+
+   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
+}
+
 static void
 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
 {
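
The gfx and ACE flush paths above share one shape: look up the
AC_UD_SHADER_QUERY_STATE user SGPR, compute the enable mask, then write it to
that queue's command stream. A hedged sketch of the common core (this helper
does not exist in the tree; RADV keeps the two paths separate because the
enable conditions differ):

   /* Illustrative only: the shared shape of the _gfx and _ace variants. */
   static void
   emit_shader_query_state(struct radeon_cmdbuf *cs, const struct radv_shader *shader,
                           enum radv_shader_query_state state)
   {
      const struct radv_userdata_info *loc = radv_get_user_sgpr(shader, AC_UD_SHADER_QUERY_STATE);

      /* Shaders compiled without shader queries have no such user SGPR. */
      if (loc->sgpr_idx == -1)
         return;

      /* User SGPRs live at the stage's user_data_0 base, 4 bytes per SGPR. */
      radeon_set_sh_reg(cs, shader->info.user_data_0 + loc->sgpr_idx * 4, state);
   }

The gfx variant passes cmd_buffer->cs with the last VGT shader; the ACE
variant passes cmd_buffer->gang.cs with the task shader.
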
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 73b98e9d695..caee8d17474 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1676,6 +1676,7 @@ struct radv_cmd_state {
    bool perfect_occlusion_queries_enabled;
    unsigned active_pipeline_queries;
    unsigned active_pipeline_gds_queries;
+   unsigned active_pipeline_ace_queries; /* Task shader invocations query */
    unsigned active_prims_gen_queries;
    unsigned active_prims_xfb_queries;
    unsigned active_prims_gen_gds_queries;
@@ -2999,6 +3000,7 @@ struct radv_query_pool {
    uint64_t size;
    char *ptr;
    bool uses_gds; /* For NGG GS on GFX10+ */
+   bool uses_ace; /* For task shader invocations on GFX10.3+ */
 };
 
 struct radv_perfcounter_impl;
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index 6b542d3bb8c..26aaaec5a5d 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -280,6 +280,7 @@ build_pipeline_statistics_query_shader(struct radv_device *device)
 
    nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
    nir_variable *result = nir_local_variable_create(b.impl, glsl_int64_t_type(), "result");
+   nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
 
    nir_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), .range = 4);
    nir_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 8), .range = 12);
@@ -298,15 +299,34 @@ build_pipeline_statistics_query_shader(struct radv_device *device)
    avail_offset = nir_iadd(&b, avail_offset, nir_imul_imm(&b, global_id, 4));
 
    nir_def *available32 = nir_load_ssbo(&b, 1, 32, src_buf, avail_offset);
+   nir_store_var(&b, available, nir_i2b(&b, available32), 0x1);
+
+   nir_push_if(&b, nir_test_mask(&b, stats_mask, VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT));
+   {
+      const uint32_t idx = ffs(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT) - 1;
+
+      nir_def *avail_start_offset = nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + 4);
+      nir_def *avail_start = nir_load_ssbo(&b, 1, 32, src_buf, avail_start_offset);
+
+      nir_def *avail_end_offset =
+         nir_iadd_imm(&b, input_base, pipeline_statistics_indices[idx] * 8 + pipelinestat_block_size + 4);
+      nir_def *avail_end = nir_load_ssbo(&b, 1, 32, src_buf, avail_end_offset);
+
+      nir_def *task_invoc_result_available =
+         nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avail_start, avail_end), 0x80000000));
+
+      nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), task_invoc_result_available), 0x1);
+   }
+   nir_pop_if(&b, NULL);
 
    nir_def *result_is_64bit = nir_test_mask(&b, flags, VK_QUERY_RESULT_64_BIT);
    nir_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
    nir_def *elem_count = nir_ushr_imm(&b, stats_mask, 16);
 
    radv_store_availability(&b, flags, dst_buf, nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)),
-                           available32);
+                           nir_b2i32(&b, nir_load_var(&b, available)));
 
-   nir_push_if(&b, nir_i2b(&b, available32));
+   nir_push_if(&b, nir_load_var(&b, available));
 
    nir_store_var(&b, output_offset, output_base, 0x1);
    for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
@@ -1221,6 +1241,10 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo *
        (pCreateInfo->queryType == VK_QUERY_TYPE_MESH_PRIMITIVES_GENERATED_EXT ||
         pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT));
 
+   /* The number of task shader invocations needs to be queried on ACE. */
+   pool->uses_ace = device->physical_device->emulate_mesh_shader_queries &&
+                    (pool->vk.pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
+
    switch (pCreateInfo->queryType) {
    case VK_QUERY_TYPE_OCCLUSION:
       pool->stride = 16 * device->physical_device->rad_info.max_render_backends;
@@ -1399,6 +1423,17 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
 
          do {
             available = p_atomic_read(avail_ptr);
+
+            if (pool->uses_ace) {
+               const uint32_t task_invoc_offset =
+                  radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
+               const uint32_t *avail_ptr_start = (const uint32_t *)(src + task_invoc_offset + 4);
+               const uint32_t *avail_ptr_stop =
+                  (const uint32_t *)(src + pipelinestat_block_size + task_invoc_offset + 4);
+
+               if (!(p_atomic_read(avail_ptr_start) & 0x80000000) || !(p_atomic_read(avail_ptr_stop) & 0x80000000))
+                  available = 0;
+            }
          } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT));
 
          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
@@ -1667,6 +1702,10 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+         const uint32_t task_invoc_offset =
+            radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
+         const unsigned pipelinestat_block_size = radv_get_pipelinestat_query_size(cmd_buffer->device);
+
          for (unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
             unsigned query = firstQuery + i;
 
@@ -1676,6 +1715,17 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
 
             /* This waits on the ME. All copies below are done on the ME */
            radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
+
+            if (pool->uses_ace) {
+               const uint64_t src_va = va + query * pool->stride;
+               const uint64_t start_va = src_va + task_invoc_offset + 4;
+               const uint64_t stop_va = start_va + pipelinestat_block_size;
+
+               radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2);
+
+               radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, start_va, 0x80000000, 0xffffffff);
+               radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, stop_va, 0x80000000, 0xffffffff);
+            }
          }
       }
      radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, pool->bo,
@@ -1889,6 +1939,16 @@ gfx10_copy_gds_query_gfx(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset
    gfx10_copy_gds_query(cmd_buffer->cs, gds_offset, va);
 }
 
+static void
+gfx10_copy_gds_query_ace(struct radv_cmd_buffer *cmd_buffer, uint32_t gds_offset, uint64_t va)
+{
+   /* Make sure GDS is idle before copying the value. */
+   cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2;
+   radv_gang_cache_flush(cmd_buffer);
+
+   gfx10_copy_gds_query(cmd_buffer->gang.cs, gds_offset, va);
+}
+
 static void
 radv_update_hw_pipelinestat(struct radv_cmd_buffer *cmd_buffer)
 {
@@ -1996,6 +2056,24 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
 
          cmd_buffer->state.active_pipeline_gds_queries++;
       }
+
+      if (pool->uses_ace) {
+         uint32_t task_invoc_offset =
+            radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
+
+         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
+
+         gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
+         radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
+
+         /* Record that the command buffer needs GDS. */
+         cmd_buffer->gds_needed = true;
+
+         if (!cmd_buffer->state.active_pipeline_ace_queries)
+            cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
+
+         cmd_buffer->state.active_pipeline_ace_queries++;
+      }
       break;
    }
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
@@ -2175,6 +2253,21 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
       }
 
+      if (pool->uses_ace) {
+         uint32_t task_invoc_offset =
+            radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT);
+
+         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->gang.cs, 11);
+
+         gfx10_copy_gds_query_ace(cmd_buffer, RADV_SHADER_QUERY_TS_INVOCATION_OFFSET, va + task_invoc_offset);
+         radv_cs_write_data_imm(cmd_buffer->gang.cs, V_370_ME, va + task_invoc_offset + 4, 0x80000000);
+
+         cmd_buffer->state.active_pipeline_ace_queries--;
+
+         if (!cmd_buffer->state.active_pipeline_ace_queries)
+            cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
+      }
+
       si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, cmd_buffer->qf,
                                  V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, avail_va, 1,
                                  cmd_buffer->gfx9_eop_bug_va);
@@ -2282,6 +2375,13 @@ radv_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPoo
 
    va += pool->stride * query;
 
+   if (pool->uses_ace) {
+      if (!radv_gang_init(cmd_buffer))
+         return;
+
+      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->gang.cs, pool->bo);
+   }
+
    emit_begin_query(cmd_buffer, pool, va, pool->vk.query_type, flags, index);
 }
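
For context, this is what the new path services from the API side: a pipeline
statistics query with VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT
from VK_EXT_mesh_shader. A hedged usage sketch (error handling omitted; assumes
a device with the meshShaderQueries feature enabled):

   VkQueryPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .queryType = VK_QUERY_TYPE_PIPELINE_STATISTICS,
      .queryCount = 1,
      .pipelineStatistics = VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT,
   };
   VkQueryPool pool;
   vkCreateQueryPool(device, &pool_info, NULL, &pool);

   /* In a command buffer, around a task/mesh draw: */
   vkCmdResetQueryPool(cmd, pool, 0, 1);
   vkCmdBeginQuery(cmd, pool, 0, 0);
   vkCmdDrawMeshTasksEXT(cmd, 8, 1, 1);
   vkCmdEndQuery(cmd, pool, 0);

   /* After the submission completes: */
   uint64_t task_invocations;
   vkGetQueryPoolResults(device, pool, 0, 1, sizeof(task_invocations), &task_invocations,
                         sizeof(task_invocations), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);

On GFX10.3, vkCmdBeginQuery takes the new pool->uses_ace path above, which
initializes the gang ACE stream before emitting the begin.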
 
