Module: Mesa
Branch: staging/23.1
Commit: ff9b24141c941c9d293a59551a2ca6725e9aa8e9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ff9b24141c941c9d293a59551a2ca6725e9aa8e9

Author: Samuel Pitoiset <[email protected]>
Date:   Wed Jul 19 09:12:01 2023 +0200

radv: simplify the NGG vs legacy pipelinestat query path

NGG is enabled by default on RDNA1-2, but the driver might fall back to
legacy GS for some reason, such as XFB. On these generations, the number
of primitives generated by GS needs to be emulated from the NGG shader
because the hw doesn't increment the related pipelinestat counter.

In order to support NGG and legacy GS with that query (remember that
we can't know which pipelines are used when starting/ending queries),
we used to reserve two 64-bit counters to store the GDS results, and
the results were accumulated.

Now that legacy GS also uses GDS counters, we can simplify this path
and overwrite the pipelinestat counter directly instead of having two
separate counters.

Signed-off-by: Samuel Pitoiset <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24231>
(cherry picked from commit 50709863ace6ccd003389b595af20536980f6a3e)

---

 .pick_status.json           |  2 +-
 src/amd/vulkan/radv_query.c | 93 +++++++++++----------------------------------
 2 files changed, 24 insertions(+), 71 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 469ddfaa394..0102c0ac4e7 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -32089,7 +32089,7 @@
         "description": "radv: simplify the NGG vs legacy pipelinestat query 
path",
         "nominated": false,
         "nomination_type": null,
-        "resolution": 4,
+        "resolution": 1,
         "main_sha": null,
         "because_sha": null
     },
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index d0de76332e6..0ddac8bd72c 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -43,6 +43,13 @@
 /* TODO: Add support for mesh/task queries on GFX11 */
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 
8, 9, 10};
 
+static unsigned
+radv_get_pipelinestat_query_offset(VkQueryPipelineStatisticFlagBits query)
+{
+   uint32_t idx = ffs(query) - 1;
+   return pipeline_statistics_indices[idx] * 8;
+}
+
 static unsigned
 radv_get_pipelinestat_query_size(struct radv_device *device)
 {
@@ -285,25 +292,14 @@ build_pipeline_statistics_query_shader(struct radv_device 
*device)
    nir_ssa_def *flags = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0), 
.range = 4);
    nir_ssa_def *stats_mask = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 
8), .range = 12);
    nir_ssa_def *avail_offset = nir_load_push_constant(&b, 1, 32, 
nir_imm_int(&b, 12), .range = 16);
-   nir_ssa_def *uses_gds = nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 
16), .range = 20);
 
    nir_ssa_def *dst_buf = radv_meta_load_descriptor(&b, 0, 0);
    nir_ssa_def *src_buf = radv_meta_load_descriptor(&b, 0, 1);
 
    nir_ssa_def *global_id = get_global_ids(&b, 1);
 
-   nir_variable *input_stride = nir_local_variable_create(b.impl, 
glsl_int_type(), "input_stride");
-   nir_push_if(&b, nir_ine(&b, uses_gds, nir_imm_int(&b, 0)));
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size 
* 2 + 8 * 2), 0x1);
-   }
-   nir_push_else(&b, NULL);
-   {
-      nir_store_var(&b, input_stride, nir_imm_int(&b, pipelinestat_block_size 
* 2), 0x1);
-   }
-   nir_pop_if(&b, NULL);
-
-   nir_ssa_def *input_base = nir_imul(&b, nir_load_var(&b, input_stride), 
global_id);
+   nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
+   nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
    nir_ssa_def *output_stride = nir_load_push_constant(&b, 1, 32, 
nir_imm_int(&b, 4), .range = 8);
    nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
 
@@ -334,25 +330,6 @@ build_pipeline_statistics_query_shader(struct radv_device 
*device)
 
       nir_store_var(&b, result, nir_isub(&b, end, start), 0x1);
 
-      nir_push_if(&b, nir_iand(&b, nir_i2b(&b, uses_gds),
-                               nir_ieq(&b, nir_imm_int(&b, 1u << i),
-                                       nir_imm_int(&b, 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))));
-      {
-         /* Compute the GDS result if needed. */
-         nir_ssa_def *gds_start_offset =
-            nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 
2));
-         nir_ssa_def *gds_start = nir_load_ssbo(&b, 1, 64, src_buf, 
gds_start_offset);
-
-         nir_ssa_def *gds_end_offset =
-            nir_iadd(&b, input_base, nir_imm_int(&b, pipelinestat_block_size * 
2 + 8));
-         nir_ssa_def *gds_end = nir_load_ssbo(&b, 1, 64, src_buf, 
gds_end_offset);
-
-         nir_ssa_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start);
-
-         nir_store_var(&b, result, nir_iadd(&b, nir_load_var(&b, result), 
ngg_gds_result), 0x1);
-      }
-      nir_pop_if(&b, NULL);
-
       /* Store result */
       nir_push_if(&b, result_is_64bit);
 
@@ -1139,12 +1116,6 @@ radv_create_query_pool(struct radv_device *device, const 
VkQueryPoolCreateInfo *
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       pool->stride = radv_get_pipelinestat_query_size(device) * 2;
-      if (pool->uses_gds) {
-         /* When the query pool needs GDS (for counting the number of 
primitives generated by a
-          * geometry shader with NGG), allocate 2x64-bit values for begin/end.
-          */
-         pool->stride += 8 * 2;
-      }
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1316,7 +1287,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
          unsigned pipelinestat_block_size = 
radv_get_pipelinestat_query_size(device);
          const uint32_t *avail_ptr =
             (const uint32_t *)(pool->ptr + pool->availability_offset + 4 * 
query);
-         uint64_t ngg_gds_result = 0;
 
          do {
             available = p_atomic_read(avail_ptr);
@@ -1325,14 +1295,6 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
          if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
             result = VK_NOT_READY;
 
-         if (pool->uses_gds) {
-            /* Compute the result that was copied from GDS. */
-            const uint64_t *gds_start = (uint64_t *)(src + 
pipelinestat_block_size * 2);
-            const uint64_t *gds_stop = (uint64_t *)(src + 
pipelinestat_block_size * 2 + 8);
-
-            ngg_gds_result = gds_stop[0] - gds_start[0];
-         }
-
          const uint64_t *start = (uint64_t *)src;
          const uint64_t *stop = (uint64_t *)(src + pipelinestat_block_size);
          if (flags & VK_QUERY_RESULT_64_BIT) {
@@ -1341,13 +1303,7 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
-                     *dst = stop[pipeline_statistics_indices[i]] -
-                            start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds &&
-                         (1u << i) == 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
+                     *dst = stop[pipeline_statistics_indices[i]] - 
start[pipeline_statistics_indices[i]];
                   }
                   dst++;
                }
@@ -1359,13 +1315,7 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool 
queryPool, uint32_t first
             for (int i = 0; i < ARRAY_SIZE(pipeline_statistics_indices); ++i) {
                if (pool->pipeline_stats_mask & (1u << i)) {
                   if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
-                     *dst = stop[pipeline_statistics_indices[i]] -
-                            start[pipeline_statistics_indices[i]];
-
-                     if (pool->uses_gds &&
-                         (1u << i) == 
VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT) {
-                        *dst += ngg_gds_result;
-                     }
+                     *dst = stop[pipeline_statistics_indices[i]] - 
start[pipeline_statistics_indices[i]];
                   }
                   dst++;
                }
@@ -1588,11 +1538,10 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer 
commandBuffer, VkQueryPool queryPoo
             radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, avail_va, 1, 0xffffffff);
          }
       }
-      radv_query_shader(
-         cmd_buffer, 
&cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
-         pool->bo, dst_buffer->bo, firstQuery * pool->stride, 
dst_buffer->offset + dstOffset,
-         pool->stride, stride, dst_size, queryCount, flags, 
pool->pipeline_stats_mask,
-         pool->availability_offset + 4 * firstQuery, pool->uses_gds);
+      radv_query_shader(cmd_buffer, 
&cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline, 
pool->bo,
+                        dst_buffer->bo, firstQuery * pool->stride, 
dst_buffer->offset + dstOffset, pool->stride, stride,
+                        dst_size, queryCount, flags, pool->pipeline_stats_mask,
+                        pool->availability_offset + 4 * firstQuery, false);
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
    case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
@@ -1838,8 +1787,6 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
-      unsigned pipelinestat_block_size = 
radv_get_pipelinestat_query_size(cmd_buffer->device);
-
       radeon_check_space(cmd_buffer->device->ws, cs, 4);
 
       ++cmd_buffer->state.active_pipeline_queries;
@@ -1854,7 +1801,10 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *poo
       radeon_emit(cs, va >> 32);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size * 2;
+         uint32_t gs_prim_offset =
+            
radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PIPELINE_STAT_OFFSET, 
va);
@@ -1996,7 +1946,10 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, 
struct radv_query_pool *pool,
                                  cmd_buffer->gfx9_eop_bug_va);
 
       if (pool->uses_gds) {
-         va += pipelinestat_block_size + 8;
+         uint32_t gs_prim_offset =
+            
radv_get_pipelinestat_query_offset(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+         va += gs_prim_offset;
 
          /* pipeline statistics counter for all streams */
          gfx10_copy_gds_query(cmd_buffer, RADV_NGG_QUERY_PIPELINE_STAT_OFFSET, 
va);

Reply via email to