Module: Mesa Branch: main Commit: 0c489f18cb27d3c725f424f8f57d45636f4eb297 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=0c489f18cb27d3c725f424f8f57d45636f4eb297
Author: Danylo Piliaiev <[email protected]> Date: Thu Apr 14 17:19:21 2022 +0300 turnip: Skip load/stores for tiles with no geometry When HW binning is used, tile loads/stores can be skipped if there is no geometry in the tile. Loads can be skipped when: - The attachment won't be resolved; otherwise, if the load were skipped, there would be holes in the resolved attachment; - There is no vkCmdClearAttachments afterwards, since it is likely a partial clear done via a 2D blit (a 2D blit doesn't produce geometry). Stores can be skipped when: - The attachment was not cleared, which may happen via load_op or vkCmdClearAttachments; - The store is not a resolve. I chose to predicate each load/store separately to allow them to be skipped when only some attachments are cleared or resolved. GMEM loads are moved into a separate cs because whether to emit CP_COND_REG_EXEC depends on HW binning being enabled and on the usage of vkCmdClearAttachments. The CP_COND_REG_EXEC predicate can be changed during draw_cs only by a perf query, in which case the predicate should be re-emitted. 
(At the moment it is always re-emitted before stores) Signed-off-by: Danylo Piliaiev <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15974> --- src/freedreno/vulkan/tu_clear_blit.c | 68 +++++++++++++++++++++- src/freedreno/vulkan/tu_cmd_buffer.c | 106 ++++++++++++++++++++++++++++++----- src/freedreno/vulkan/tu_pass.c | 11 ++++ src/freedreno/vulkan/tu_private.h | 10 +++- src/freedreno/vulkan/tu_query.c | 4 ++ 5 files changed, 182 insertions(+), 17 deletions(-) diff --git a/src/freedreno/vulkan/tu_clear_blit.c b/src/freedreno/vulkan/tu_clear_blit.c index a96be2613e2..555b5edf26f 100644 --- a/src/freedreno/vulkan/tu_clear_blit.c +++ b/src/freedreno/vulkan/tu_clear_blit.c @@ -2280,6 +2280,8 @@ tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd, s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff; } } + + cmd->state.attachment_cmd_clear[a] = true; } /* We may not know the multisample count if there are no attachments, so @@ -2551,6 +2553,8 @@ tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd, if (a == VK_ATTACHMENT_UNUSED) continue; + cmd->state.attachment_cmd_clear[a] = true; + tu_emit_clear_gmem_attachment(cmd, cs, a, attachments[j].aspectMask, &attachments[j].clearValue); } @@ -2799,24 +2803,64 @@ blit_can_resolve(VkFormat format) return true; } +static void +tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, bool load) +{ + tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); +} + +static void +tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, bool load) +{ + tu_cond_exec_end(cs); +} + void tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, + bool cond_exec_allowed, bool force_load) { const struct tu_image_view *iview = cmd->state.attachments[a]; const struct tu_render_pass_attachment *attachment = &cmd->state.pass->attachments[a]; + bool load_common = attachment->load || force_load; + bool load_stencil = + 
attachment->load_stencil || + (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load); + + if (!load_common && !load_stencil) + return; + trace_start_gmem_load(&cmd->trace, cs); - if (attachment->load || force_load) + /* If attachment will be cleared by vkCmdClearAttachments - it is likely + * that it would be partially cleared, and since it is done by 2d blit + * it doesn't produce geometry, so we have to unconditionally load. + * + * To simplify conditions treat partially cleared separate DS as fully + * cleared and don't emit cond_exec. + */ + bool cond_exec = cond_exec_allowed && + !attachment->clear_mask && + !cmd->state.attachment_cmd_clear[a] && + !attachment->will_be_resolved; + if (cond_exec) + tu_begin_load_store_cond_exec(cmd, cs, true); + + if (load_common) tu_emit_blit(cmd, cs, iview, attachment, false, false); - if (attachment->load_stencil || (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load)) + if (load_stencil) tu_emit_blit(cmd, cs, iview, attachment, false, true); + if (cond_exec) + tu_end_load_store_cond_exec(cmd, cs, true); + trace_end_gmem_load(&cmd->trace, cs, attachment->format, force_load); } @@ -2919,7 +2963,8 @@ void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, - uint32_t gmem_a) + uint32_t gmem_a, + bool cond_exec_allowed) { struct tu_physical_device *phys_dev = cmd->device->physical_device; const VkRect2D *render_area = &cmd->state.render_area; @@ -2930,6 +2975,15 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, if (!dst->store && !dst->store_stencil) return; + bool was_cleared = src->clear_mask || cmd->state.attachment_cmd_clear[a]; + /* Unconditional store should happen only if attachment was cleared, + * which could have happened either by load_op or via vkCmdClearAttachments. 
+ */ + bool cond_exec = cond_exec_allowed && !was_cleared; + if (cond_exec) { + tu_begin_load_store_cond_exec(cmd, cs, false); + } + uint32_t x1 = render_area->offset.x; uint32_t y1 = render_area->offset.y; uint32_t x2 = x1 + render_area->extent.width; @@ -2971,6 +3025,10 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, if (store_separate_stencil) tu_emit_blit(cmd, cs, iview, src, true, true); + if (cond_exec) { + tu_end_load_store_cond_exec(cmd, cs, false); + } + trace_end_gmem_store(&cmd->trace, cs, dst->format, true, false); return; } @@ -3011,5 +3069,9 @@ tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, } } + if (cond_exec) { + tu_end_load_store_cond_exec(cmd, cs, false); + } + trace_end_gmem_store(&cmd->trace, cs, dst->format, false, unaligned); } diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index e97765ccdc8..3f1dd3831b6 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -632,6 +632,25 @@ use_sysmem_rendering(struct tu_cmd_buffer *cmd, return use_sysmem; } +/* Optimization: there is no reason to load gmem if there is no + * geometry to process. COND_REG_EXEC predicate is set here, + * but the actual skip happens in tile_load_cs and tile_store_cs, + * for each blit separately. 
+ */ +static void +tu6_emit_cond_for_load_stores(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + uint32_t pipe, uint32_t slot, bool wfm) +{ + if (use_hw_binning(cmd)) { + tu_cs_emit_pkt7(cs, CP_REG_TEST, 1); + tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(pipe)) | + A6XX_CP_REG_TEST_0_BIT(slot) | + COND(wfm, A6XX_CP_REG_TEST_0_WAIT_FOR_ME)); + } else { + /* COND_REG_EXECs are not emitted in non-binning case */ + } +} + static void tu6_emit_tile_select(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -664,6 +683,8 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu_cs_emit(cs, pipe * 4); tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch); + tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, true); + tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1); tu_cs_emit(cs, 0x0); @@ -740,6 +761,15 @@ tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd, } } +static void +tu6_emit_tile_load(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + tu6_emit_blit_scissor(cmd, cs, true); + + for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) + tu_load_gmem_attachment(cmd, cs, i, use_hw_binning(cmd), false); +} + static void tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) { @@ -756,7 +786,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) for (uint32_t a = 0; a < pass->attachment_count; ++a) { if (pass->attachments[a].gmem_offset >= 0) - tu_store_gmem_attachment(cmd, cs, a, a); + tu_store_gmem_attachment(cmd, cs, a, a, use_hw_binning(cmd)); } if (subpass->resolve_attachments) { @@ -764,7 +794,7 @@ tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs) uint32_t a = subpass->resolve_attachments[i].attachment; if (a != VK_ATTACHMENT_UNUSED) { uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu_store_gmem_attachment(cmd, cs, a, gmem_a); + tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); } } } @@ -1220,11 +1250,6 @@ tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd, tu_cond_exec_start(cs, 
CP_COND_EXEC_0_RENDER_MODE_GMEM); - tu6_emit_blit_scissor(cmd, cs, true); - - for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) - tu_load_gmem_attachment(cmd, cs, i, false); - tu6_emit_blit_scissor(cmd, cs, false); for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i) @@ -1356,8 +1381,10 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs, } static void -tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs, + uint32_t pipe, uint32_t slot) { + tu_cs_emit_call(cs, &cmd->tile_load_cs); tu_cs_emit_call(cs, &cmd->draw_cs); if (use_hw_binning(cmd)) { @@ -1365,6 +1392,10 @@ tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); } + /* Predicate is changed in draw_cs so we have to re-emit it */ + if (cmd->state.draw_cs_writes_to_cond_pred) + tu6_emit_cond_for_load_stores(cmd, cs, pipe, slot, false); + tu_cs_emit_call(cs, &cmd->tile_store_cs); if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) { @@ -1418,7 +1449,7 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd, tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot); trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs); - tu6_render_tile(cmd, &cmd->cs); + tu6_render_tile(cmd, &cmd->cs, pipe, slot); trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs); } } @@ -1491,6 +1522,7 @@ tu_create_cmd_buffer(struct tu_device *device, list_inithead(&cmd_buffer->renderpass_autotune_results); tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096); + tu_cs_init(&cmd_buffer->tile_load_cs, device, TU_CS_MODE_GROW, 2048); tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096); tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048); tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096); @@ -1507,11 +1539,14 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer) list_del(&cmd_buffer->pool_link); 
tu_cs_finish(&cmd_buffer->cs); + tu_cs_finish(&cmd_buffer->tile_load_cs); tu_cs_finish(&cmd_buffer->draw_cs); tu_cs_finish(&cmd_buffer->tile_store_cs); tu_cs_finish(&cmd_buffer->draw_epilogue_cs); tu_cs_finish(&cmd_buffer->sub_cs); + vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear); + u_trace_fini(&cmd_buffer->trace); tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); @@ -1535,11 +1570,15 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer) cmd_buffer->record_result = VK_SUCCESS; tu_cs_reset(&cmd_buffer->cs); + tu_cs_reset(&cmd_buffer->tile_load_cs); tu_cs_reset(&cmd_buffer->draw_cs); tu_cs_reset(&cmd_buffer->tile_store_cs); tu_cs_reset(&cmd_buffer->draw_epilogue_cs); tu_cs_reset(&cmd_buffer->sub_cs); + vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear); + cmd_buffer->state.attachment_cmd_clear = NULL; + tu_autotune_free_results(cmd_buffer->device, &cmd_buffer->renderpass_autotune_results); for (unsigned i = 0; i < MAX_BIND_POINTS; i++) { @@ -1678,6 +1717,7 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, cmd_buffer->usage_flags = pBeginInfo->flags; tu_cs_begin(&cmd_buffer->cs); + tu_cs_begin(&cmd_buffer->tile_load_cs); tu_cs_begin(&cmd_buffer->draw_cs); tu_cs_begin(&cmd_buffer->tile_store_cs); tu_cs_begin(&cmd_buffer->draw_epilogue_cs); @@ -1710,6 +1750,14 @@ tu_BeginCommandBuffer(VkCommandBuffer commandBuffer, cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass); cmd_buffer->state.subpass = &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass]; + /* vkCmdClearAttachments is allowed in a secondary cmdbuf and we have to + * track it as in primary cmdbuf. 
+ */ + cmd_buffer->state.attachment_cmd_clear = + vk_zalloc(&cmd_buffer->pool->vk.alloc, + cmd_buffer->state.pass->attachment_count * + sizeof(cmd_buffer->state.attachment_cmd_clear[0]), + 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); } else { /* When executing in the middle of another command buffer, the CCU * state is unknown. @@ -2245,6 +2293,7 @@ tu_EndCommandBuffer(VkCommandBuffer commandBuffer) } tu_cs_end(&cmd_buffer->cs); + tu_cs_end(&cmd_buffer->tile_load_cs); tu_cs_end(&cmd_buffer->draw_cs); tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); @@ -3061,7 +3110,7 @@ vk2tu_src_stage(VkPipelineStageFlags vk_stages) { enum tu_stage stage = TU_STAGE_CP; u_foreach_bit (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); + enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false); stage = MAX2(stage, new_stage); } @@ -3073,7 +3122,7 @@ vk2tu_dst_stage(VkPipelineStageFlags vk_stages) { enum tu_stage stage = TU_STAGE_PS; u_foreach_bit (bit, vk_stages) { - enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); + enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true); stage = MIN2(stage, new_stage); } @@ -3130,6 +3179,14 @@ tu_CmdExecuteCommands(VkCommandBuffer commandBuffer, cmd->state.has_subpass_predication = true; if (secondary->state.disable_gmem) cmd->state.disable_gmem = true; + + cmd->state.draw_cs_writes_to_cond_pred |= + secondary->state.draw_cs_writes_to_cond_pred; + + for (uint32_t i = 0; i < cmd->state.pass->attachment_count; i++) { + cmd->state.attachment_cmd_clear[i] |= + secondary->state.attachment_cmd_clear[i]; + } } else { assert(tu_cs_is_empty(&secondary->draw_cs)); assert(tu_cs_is_empty(&secondary->draw_epilogue_cs)); @@ -3307,6 +3364,18 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer, return; } + cmd->state.attachment_cmd_clear = + vk_zalloc(&cmd->pool->vk.alloc, pass->attachment_count * + sizeof(cmd->state.attachment_cmd_clear[0]), 8, + 
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (!cmd->state.attachment_cmd_clear) { + cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY; + return; + } + + cmd->state.draw_cs_writes_to_cond_pred = false; + for (unsigned i = 0; i < pass->attachment_count; i++) { cmd->state.attachments[i] = pAttachmentInfo ? tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) : @@ -3400,7 +3469,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i); - tu_store_gmem_attachment(cmd, cs, a, gmem_a); + tu_store_gmem_attachment(cmd, cs, a, gmem_a, false); if (pass->attachments[a].gmem_offset < 0) continue; @@ -3410,7 +3479,7 @@ tu_CmdNextSubpass2(VkCommandBuffer commandBuffer, * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM.. */ tu_finishme("missing GMEM->GMEM resolve path\n"); - tu_load_gmem_attachment(cmd, cs, a, true); + tu_load_gmem_attachment(cmd, cs, a, false, true); } } @@ -4627,8 +4696,15 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, { TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer); + /* GMEM loads are created after draw_cs in the separate cs + * because they need to know whether to allow their conditional + * execution, which is tied to a state that is known only at + * the end of the renderpass. 
+ */ + tu6_emit_tile_load(cmd_buffer, &cmd_buffer->tile_load_cs); tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs); + tu_cs_end(&cmd_buffer->tile_load_cs); tu_cs_end(&cmd_buffer->draw_cs); tu_cs_end(&cmd_buffer->tile_store_cs); tu_cs_end(&cmd_buffer->draw_epilogue_cs); @@ -4649,6 +4725,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, /* discard draw_cs and draw_epilogue_cs entries now that the tiles are rendered */ + tu_cs_discard_entries(&cmd_buffer->tile_load_cs); + tu_cs_begin(&cmd_buffer->tile_load_cs); tu_cs_discard_entries(&cmd_buffer->draw_cs); tu_cs_begin(&cmd_buffer->draw_cs); tu_cs_discard_entries(&cmd_buffer->tile_store_cs); @@ -4661,6 +4739,8 @@ tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer, tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true); vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachments); + vk_free(&cmd_buffer->pool->vk.alloc, cmd_buffer->state.attachment_cmd_clear); + cmd_buffer->state.attachment_cmd_clear = NULL; cmd_buffer->state.pass = NULL; cmd_buffer->state.subpass = NULL; diff --git a/src/freedreno/vulkan/tu_pass.c b/src/freedreno/vulkan/tu_pass.c index e7bc2c7da0f..64d9de2676f 100644 --- a/src/freedreno/vulkan/tu_pass.c +++ b/src/freedreno/vulkan/tu_pass.c @@ -800,6 +800,12 @@ tu_CreateRenderPass2(VkDevice _device, for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) { subpass->resolve_attachments[j].attachment = desc->pResolveAttachments[j].attachment; + + uint32_t src_a = desc->pColorAttachments[j].attachment; + if (src_a != VK_ATTACHMENT_UNUSED) { + pass->attachments[src_a].will_be_resolved = + desc->pResolveAttachments[j].attachment != VK_ATTACHMENT_UNUSED; + } } } @@ -808,6 +814,11 @@ tu_CreateRenderPass2(VkDevice _device, subpass->resolve_count++; uint32_t a = ds_resolve->pDepthStencilResolveAttachment->attachment; subpass->resolve_attachments[subpass->resolve_count - 1].attachment = a; + + uint32_t src_a = desc->pDepthStencilAttachment->attachment; + if (src_a != 
VK_ATTACHMENT_UNUSED) { + pass->attachments[src_a].will_be_resolved = a != VK_ATTACHMENT_UNUSED; + } } uint32_t a = desc->pDepthStencilAttachment ? diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index b35647f9887..52b4fc3bccb 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -1196,6 +1196,10 @@ struct tu_cmd_state VkRect2D render_area; const struct tu_image_view **attachments; + /* Tracks whether attachment was cleared by vkCmdClearAttachments */ + bool *attachment_cmd_clear; + /* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */ + bool draw_cs_writes_to_cond_pred; bool xfb_used; bool has_tess; @@ -1290,6 +1294,7 @@ struct tu_cmd_buffer VkResult record_result; struct tu_cs cs; + struct tu_cs tile_load_cs; struct tu_cs draw_cs; struct tu_cs tile_store_cs; struct tu_cs draw_epilogue_cs; @@ -1576,6 +1581,7 @@ void tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, + bool cond_exec_allowed, bool force_load); /* expose this function to be able to emit load without checking LOAD_OP */ @@ -1587,7 +1593,8 @@ void tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a, - uint32_t gmem_a); + uint32_t gmem_a, + bool cond_exec_allowed); enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format); @@ -1857,6 +1864,7 @@ struct tu_render_pass_attachment bool load; bool store; int32_t gmem_offset; + bool will_be_resolved; /* for D32S8 separate stencil: */ bool load_stencil; bool store_stencil; diff --git a/src/freedreno/vulkan/tu_query.c b/src/freedreno/vulkan/tu_query.c index 7bf710f5e30..2b374b73626 100644 --- a/src/freedreno/vulkan/tu_query.c +++ b/src/freedreno/vulkan/tu_query.c @@ -874,6 +874,10 @@ emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf, struct tu_cs *cs = cmdbuf->state.pass ? 
&cmdbuf->draw_cs : &cmdbuf->cs; uint32_t last_pass = ~0; + if (cmdbuf->state.pass) { + cmdbuf->state.draw_cs_writes_to_cond_pred = true; + } + /* Querying perf counters happens in these steps: * * 0) There's a scratch reg to set a pass index for perf counters query.
