Module: Mesa
Branch: main
Commit: d1109f67bb0269915dacbddb26a1ce29f2cb83e6
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=d1109f67bb0269915dacbddb26a1ce29f2cb83e6
Author: Rohan Garg <rohan.g...@intel.com>
Date: Tue Jun 21 15:51:31 2022 +0200

iris: Emit EXECUTE_INDIRECT_DRAW when available

On newer platforms (Arrowlake and above) we can issue an
EXECUTE_INDIRECT_DRAW that allows us to:
 * Skip issuing MI load/store instructions for indirect parameters
 * Skip doing the indirect draw unroll on the CPU side when the
   appropriate stride is passed

Signed-off-by: Rohan Garg <rohan.g...@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26178>
---
 src/gallium/drivers/iris/iris_context.h |  24 ++++++
 src/gallium/drivers/iris/iris_draw.c    |  89 +++++++++++---------
 src/gallium/drivers/iris/iris_screen.h  |   4 +
 src/gallium/drivers/iris/iris_state.c   | 145 ++++++++++++++++++++++++++++++++
 4 files changed, 220 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index b59d23f555f..36491236d5a 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -1152,6 +1152,30 @@ int iris_get_driver_query_group_info(struct pipe_screen *pscreen, void gfx9_toggle_preemption(struct iris_context *ice, struct iris_batch *batch, const struct pipe_draw_info *draw); +static const bool +iris_execute_indirect_draw_supported(const struct iris_context *ice, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_info *draw) +{ + const struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; + const struct brw_vs_prog_data *vs_prog_data = (void *) + ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data; + const bool is_multiview = draw->view_mask != 0; + const size_t struct_size = draw->index_size ? 
+ sizeof(uint32_t) * 5 : + sizeof(uint32_t) * 4; + const bool aligned_stride = + indirect && (indirect->stride == 0 || indirect->stride == struct_size); + + return (screen->devinfo->has_indirect_unroll && + aligned_stride && + (indirect && + !indirect->count_from_stream_output) && + !is_multiview && + !(vs_prog_data->uses_firstvertex || + vs_prog_data->uses_baseinstance || + vs_prog_data->uses_drawid)); +} #ifdef genX # include "iris_genx_protos.h" diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c index dacbdbf3226..bc897ba0f7d 100644 --- a/src/gallium/drivers/iris/iris_draw.c +++ b/src/gallium/drivers/iris/iris_draw.c @@ -181,6 +181,22 @@ iris_update_draw_parameters(struct iris_context *ice, } } +static void +iris_simple_draw_vbo(struct iris_context *ice, + const struct pipe_draw_info *draw, + unsigned drawid_offset, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc) +{ + struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + + iris_batch_maybe_flush(batch, 1500); + + iris_update_draw_parameters(ice, draw, drawid_offset, indirect, sc); + + batch->screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc); +} + static void iris_indirect_draw_vbo(struct iris_context *ice, const struct pipe_draw_info *dinfo, @@ -191,42 +207,47 @@ iris_indirect_draw_vbo(struct iris_context *ice, struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; struct pipe_draw_info info = *dinfo; struct pipe_draw_indirect_info indirect = *dindirect; - - iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer), - IRIS_DOMAIN_VF_READ); - - if (indirect.indirect_draw_count) { - struct iris_bo *draw_count_bo = - iris_resource_bo(indirect.indirect_draw_count); - iris_emit_buffer_barrier_for(batch, draw_count_bo, - IRIS_DOMAIN_OTHER_READ); - - if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) { - /* Upload MI_PREDICATE_RESULT to GPR15.*/ - 
batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); - } - } + const bool use_predicate = + ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT; const uint64_t orig_dirty = ice->state.dirty; const uint64_t orig_stage_dirty = ice->state.stage_dirty; - for (int i = 0; i < indirect.draw_count; i++) { + if (iris_execute_indirect_draw_supported(ice, &indirect, &info)) { iris_batch_maybe_flush(batch, 1500); - iris_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draw); + iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw); - batch->screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draw); + batch->screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw); + } else { + iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer), + IRIS_DOMAIN_VF_READ); - ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_RENDER; - ice->state.stage_dirty &= ~IRIS_ALL_STAGE_DIRTY_FOR_RENDER; + if (indirect.indirect_draw_count) { + struct iris_bo *draw_count_bo = + iris_resource_bo(indirect.indirect_draw_count); + iris_emit_buffer_barrier_for(batch, draw_count_bo, + IRIS_DOMAIN_OTHER_READ); + } - indirect.offset += indirect.stride; - } + if (use_predicate) { + /* Upload MI_PREDICATE_RESULT to GPR15.*/ + batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT); + } + + for (int i = 0; i < indirect.draw_count; i++) { + iris_simple_draw_vbo(ice, &info, drawid_offset + i, &indirect, draw); + + ice->state.dirty &= ~IRIS_ALL_DIRTY_FOR_RENDER; + ice->state.stage_dirty &= ~IRIS_ALL_STAGE_DIRTY_FOR_RENDER; + + indirect.offset += indirect.stride; + } - if (indirect.indirect_draw_count && - ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) { - /* Restore MI_PREDICATE_RESULT. */ - batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15)); + if (use_predicate) { + /* Restore MI_PREDICATE_RESULT. 
*/ + batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15)); + } } /* Put this back for post-draw resolves, we'll clear it again after. */ @@ -234,22 +255,6 @@ iris_indirect_draw_vbo(struct iris_context *ice, ice->state.stage_dirty = orig_stage_dirty; } -static void -iris_simple_draw_vbo(struct iris_context *ice, - const struct pipe_draw_info *draw, - unsigned drawid_offset, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count_bias *sc) -{ - struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; - - iris_batch_maybe_flush(batch, 1500); - - iris_update_draw_parameters(ice, draw, drawid_offset, indirect, sc); - - batch->screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset, indirect, sc); -} - /** * The pipe->draw_vbo() driver hook. Performs a draw on the GPU. */ diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h index cbb52e0fbb6..11277535642 100644 --- a/src/gallium/drivers/iris/iris_screen.h +++ b/src/gallium/drivers/iris/iris_screen.h @@ -69,6 +69,10 @@ struct iris_vtable { unsigned drawid_offset, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count_bias *sc); + void (*upload_indirect_render_state)(struct iris_context *ice, + const struct pipe_draw_info *draw, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc); void (*update_binder_address)(struct iris_batch *batch, struct iris_binder *binder); void (*upload_compute_state)(struct iris_context *ice, diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 7ec776fd2dc..a6ec245b473 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -8317,6 +8317,150 @@ iris_upload_render_state(struct iris_context *ice, trace_intel_end_draw(&batch->trace, count); } +static void +iris_upload_indirect_render_state(struct iris_context *ice, + const struct 
pipe_draw_info *draw, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc) +{ +#if GFX_VERx10 >= 125 + assert(indirect); + + struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER]; + UNUSED struct iris_screen *screen = batch->screen; + UNUSED const struct intel_device_info *devinfo = screen->devinfo; + const bool use_predicate = + ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT; + + trace_intel_begin_draw(&batch->trace); + + if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES) + flush_vbos(ice, batch); + + iris_batch_sync_region_start(batch); + + /* Always pin the binder. If we're emitting new binding table pointers, + * we need it. If not, we're probably inheriting old tables via the + * context, and need it anyway. Since true zero-bindings cases are + * practically non-existent, just pin it and avoid last_res tracking. + */ + iris_use_pinned_bo(batch, ice->state.binder.bo, false, + IRIS_DOMAIN_NONE); + + if (!batch->contains_draw) { + /* Re-emit constants when starting a new batch buffer in order to + * work around push constant corruption on context switch. + * + * XXX - Provide hardware spec quotation when available. + */ + ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS | + IRIS_STAGE_DIRTY_CONSTANTS_TCS | + IRIS_STAGE_DIRTY_CONSTANTS_TES | + IRIS_STAGE_DIRTY_CONSTANTS_GS | + IRIS_STAGE_DIRTY_CONSTANTS_FS); + batch->contains_draw = true; + } + + if (!batch->contains_draw_with_next_seqno) { + iris_restore_render_saved_bos(ice, batch, draw); + batch->contains_draw_with_next_seqno = true; + } + + /* Wa_1306463417 - Send HS state for every primitive on gfx11. + * Wa_16011107343 (same for gfx12) + * We implement this by setting TCS dirty on each draw. 
+ */ + if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) && + ice->shaders.prog[MESA_SHADER_TESS_CTRL]) { + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS; + } + + iris_upload_dirty_render_state(ice, batch, draw); + + if (draw->index_size > 0) { + unsigned offset; + + if (draw->has_user_indices) { + unsigned start_offset = draw->index_size * sc->start; + + u_upload_data(ice->ctx.const_uploader, start_offset, + sc->count * draw->index_size, 4, + (char*)draw->index.user + start_offset, + &offset, &ice->state.last_res.index_buffer); + offset -= start_offset; + } else { + struct iris_resource *res = (void *) draw->index.resource; + res->bind_history |= PIPE_BIND_INDEX_BUFFER; + + pipe_resource_reference(&ice->state.last_res.index_buffer, + draw->index.resource); + offset = 0; + + iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ); + } + + struct iris_genx_state *genx = ice->state.genx; + struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer); + + uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)]; + iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) { + ib.IndexFormat = draw->index_size >> 1; + ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev, + ISL_SURF_USAGE_INDEX_BUFFER_BIT); + ib.BufferSize = bo->size - offset; + ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset); + ib.L3BypassDisable = true; + } + + if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) { + memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet)); + iris_batch_emit(batch, ib_packet, sizeof(ib_packet)); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ); + } + + } + + iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc); + + genX(maybe_emit_breakpoint)(batch, true); + + iris_emit_cmd(batch, GENX(EXECUTE_INDIRECT_DRAW), ind) { + ind.ArgumentFormat = + draw->index_size > 0 ? 
DRAWINDEXED : DRAW; + ind.PredicateEnable = use_predicate; + ind.TBIMREnabled = ice->state.use_tbimr; + ind.MaxCount = indirect->draw_count; + + if (indirect->buffer) { + struct iris_bo *bo = iris_resource_bo(indirect->buffer); + ind.ArgumentBufferStartAddress = ro_bo(bo, indirect->offset); + ind.MOCS = iris_mocs(bo, &screen->isl_dev, 0); + } else { + ind.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); + } + + if (indirect->indirect_draw_count) { + struct iris_bo *draw_count_bo = + iris_resource_bo(indirect->indirect_draw_count); + ind.CountBufferIndirectEnable = true; + ind.CountBufferAddress = + ro_bo(draw_count_bo, indirect->indirect_draw_count_offset); + } + } + + genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count); + genX(maybe_emit_breakpoint)(batch, false); + + iris_batch_sync_region_end(batch); + + uint32_t count = (sc) ? sc->count : 0; + count *= draw->instance_count ? draw->instance_count : 1; + trace_intel_end_draw(&batch->trace, count); +#else + unreachable("Unsupported path"); +#endif /* GFX_VERx10 >= 125 */ +} + static void iris_load_indirect_location(struct iris_context *ice, struct iris_batch *batch, @@ -9728,6 +9872,7 @@ genX(init_screen_state)(struct iris_screen *screen) screen->vtbl.init_render_context = iris_init_render_context; screen->vtbl.init_compute_context = iris_init_compute_context; screen->vtbl.upload_render_state = iris_upload_render_state; + screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state; screen->vtbl.update_binder_address = iris_update_binder_address; screen->vtbl.upload_compute_state = iris_upload_compute_state; screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;