Module: Mesa
Branch: main
Commit: 7d96fe853e1b007293054645caf54ade77a77c90
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7d96fe853e1b007293054645caf54ade77a77c90
Author: Samuel Pitoiset <samuel.pitoi...@gmail.com>
Date:   Tue Oct 3 17:34:06 2023 +0200

radv: fix a synchronization issue with primitives generated query on RDNA1-2

Only RDNA1-2 are affected because RADV needs to handle the legacy vs NGG
path for this query, and the NGG results are stored with 2 extra 64-bit
values.

Fixes flakes with dEQP-VK.transform_feedback.primitives_generated_query.*
since VKCTS 1.3.7.0.

Cc: mesa-stable
Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25862>

---

 src/amd/ci/radv-navi21-aco-flakes.txt |  3 ---
 src/amd/vulkan/radv_query.c           | 48 +++++++++++++++++++++++++----------
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/src/amd/ci/radv-navi21-aco-flakes.txt b/src/amd/ci/radv-navi21-aco-flakes.txt index 663dfca4d0b..b6e62fed8ad 100644 --- a/src/amd/ci/radv-navi21-aco-flakes.txt +++ b/src/amd/ci/radv-navi21-aco-flakes.txt @@ -19,6 +19,3 @@ dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compi dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.multithreaded_compilation.*_check_capture_replay_handles dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_all_handles dEQP-VK.ray_tracing_pipeline.pipeline_library.configurations.singlethreaded_compilation.*_check_capture_replay_handles - -# New CTS flakes in 1.3.7.0 -dEQP-VK.transform_feedback.primitives_generated_query.get.*
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 6596e93199b..7e43604c8bc 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -584,7 +584,7 @@ build_timestamp_query_shader(struct radv_device *device) } #define RADV_PGQ_STRIDE 32 -#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 4 * 2) +#define RADV_PGQ_STRIDE_GDS (RADV_PGQ_STRIDE + 8 * 2) static nir_shader * build_pg_query_shader(struct radv_device *device) @@ -663,11 +663,21 @@ build_pg_query_shader(struct
radv_device *device) nir_def *avails[2]; avails[0] = nir_channel(&b, load1, 1); avails[1] = nir_channel(&b, load2, 1); - nir_def *result_is_available = - nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]), nir_imm_int(&b, 0x80000000))); + nir_store_var(&b, available, nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, avails[0], avails[1]), 0x80000000)), 0x1); + + nir_push_if(&b, uses_gds); + { + nir_def *gds_avail_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 36), .align_mul = 4); + nir_def *gds_avail_end = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd_imm(&b, input_base, 44), .align_mul = 4); + nir_def *gds_result_available = + nir_i2b(&b, nir_iand_imm(&b, nir_iand(&b, gds_avail_start, gds_avail_end), 0x80000000)); + + nir_store_var(&b, available, nir_iand(&b, nir_load_var(&b, available), gds_result_available), 0x1); + } + nir_pop_if(&b, NULL); /* Only compute result if available. */ - nir_push_if(&b, result_is_available); + nir_push_if(&b, nir_load_var(&b, available)); /* Pack values. */ nir_def *packed64[2]; @@ -684,7 +694,7 @@ build_pg_query_shader(struct radv_device *device) nir_def *gds_start = nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 32)), .align_mul = 4); nir_def *gds_end = - nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 36)), .align_mul = 4); + nir_load_ssbo(&b, 1, 32, src_buf, nir_iadd(&b, input_base, nir_imm_int(&b, 40)), .align_mul = 4); nir_def *ngg_gds_result = nir_isub(&b, gds_end, gds_start); @@ -692,8 +702,6 @@ build_pg_query_shader(struct radv_device *device) } nir_pop_if(&b, NULL); - nir_store_var(&b, available, nir_imm_true(&b), 0x1); - nir_pop_if(&b, NULL); /* Determine if result is 64 or 32 bit. 
*/ @@ -1087,7 +1095,7 @@ radv_create_query_pool(struct radv_device *device, const VkQueryPoolCreateInfo * case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) { /* When the hardware can use both the legacy and the NGG paths in the same begin/end pair, - * allocate 2x32-bit values for the GDS counters. + * allocate 2x64-bit values for the GDS counters. */ pool->stride = RADV_PGQ_STRIDE_GDS; } else { @@ -1318,6 +1326,7 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first break; } case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: { + const bool uses_gds_query = pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11; p_atomic_uint64_t const *src64 = (p_atomic_uint64_t const *)src; uint64_t primitive_storage_needed; @@ -1333,6 +1342,10 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first !(p_atomic_read(src64 + 2) & 0x8000000000000000UL)) { available = 0; } + if (uses_gds_query && (!(p_atomic_read(src64 + 4) & 0x8000000000000000UL) || + !(p_atomic_read(src64 + 5) & 0x8000000000000000UL))) { + available = 0; + } } while (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)); if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) @@ -1340,11 +1353,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first primitive_storage_needed = src64[2] - src64[0]; - if (pool->uses_gds && device->physical_device->rad_info.gfx_level < GFX11) { - uint32_t const *src32 = (uint32_t const *)src; - + if (uses_gds_query) { /* Accumulate the result that was copied from GDS in case NGG shader has been used. 
*/ - primitive_storage_needed += src32[9] - src32[8]; + primitive_storage_needed += src64[5] - src64[4]; } if (flags & VK_QUERY_RESULT_64_BIT) { @@ -1539,15 +1550,22 @@ radv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPoo break; case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: if (flags & VK_QUERY_RESULT_WAIT_BIT) { + const bool uses_gds_query = pool->uses_gds && cmd_buffer->device->physical_device->rad_info.gfx_level < GFX11; + for (unsigned i = 0; i < queryCount; i++) { unsigned query = firstQuery + i; uint64_t src_va = va + query * pool->stride; - radeon_check_space(cmd_buffer->device->ws, cs, 7 * 2); + radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4); /* Wait on the upper word of the PrimitiveStorageNeeded result. */ radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 4, 0x80000000, 0xffffffff); radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 20, 0x80000000, 0xffffffff); + + if (uses_gds_query) { + radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 36, 0x80000000, 0xffffffff); + radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_GREATER_OR_EQUAL, src_va + 44, 0x80000000, 0xffffffff); + } } } @@ -1829,6 +1847,7 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo if (pool->uses_gds) { /* generated prim counter */ gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 32); + radv_cs_write_data_imm(cs, V_370_ME, va + 36, 0x80000000); /* Record that the command buffer needs GDS. 
*/ cmd_buffer->gds_needed = true; @@ -1983,7 +2002,8 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool, if (pool->uses_gds) { /* generated prim counter */ - gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 36); + gfx10_copy_gds_query(cmd_buffer, RADV_SHADER_QUERY_PRIM_GEN_OFFSET(index), va + 40); + radv_cs_write_data_imm(cs, V_370_ME, va + 44, 0x80000000); cmd_buffer->state.active_prims_gen_gds_queries--;