Module: Mesa Branch: main Commit: a5f0f7d4b162c04878fb9d505d55ebdd05c5c773 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=a5f0f7d4b162c04878fb9d505d55ebdd05c5c773
Author: Danylo Piliaiev <[email protected]> Date: Tue Sep 5 18:24:03 2023 +0200 turnip,ir3: Implement A7XX push consts load via preamble New push consts loading consists of: - Push consts are set for the entire pipeline via HLSQ_SHARED_CONSTS_IMM array which could fit up to 256b of push consts. - For each shader stage that uses push consts READ_IMM_SHARED_CONSTS should be set in HLSQ_*_CNTL, otherwise push consts may get overwritten by new push consts that are set after the draw. - Push consts are loaded into consts reg file in a shader preamble via stsc at the very start of the preamble. OPC_PUSH_CONSTS_LOAD_MACRO is used instead of directly translating NIR intrinsic into stsc because: we don't want to teach legalize pass how to set (ss) between stores and loads of consts reg file, don't want stsc to be reordered, etc. Signed-off-by: Danylo Piliaiev <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25086> --- src/compiler/nir/nir_intrinsics.py | 4 ++ src/freedreno/common/freedreno_dev_info.h | 4 ++ src/freedreno/common/freedreno_devices.py | 4 +- src/freedreno/ir3/disasm-a3xx.c | 1 + src/freedreno/ir3/instr-a3xx.h | 7 +- src/freedreno/ir3/ir3.h | 5 ++ src/freedreno/ir3/ir3_compiler.c | 2 +- src/freedreno/ir3/ir3_compiler.h | 2 + src/freedreno/ir3/ir3_compiler_nir.c | 10 +++ src/freedreno/ir3/ir3_legalize.c | 81 +++++++++++++++++----- src/freedreno/ir3/ir3_nir.c | 3 + src/freedreno/ir3/ir3_nir.h | 2 + .../ir3/ir3_nir_lower_push_consts_to_preamble.c | 28 ++++++++ src/freedreno/ir3/ir3_postsched.c | 4 ++ src/freedreno/ir3/ir3_print.c | 8 ++- src/freedreno/ir3/ir3_sched.c | 4 ++ src/freedreno/ir3/ir3_shader.h | 4 ++ src/freedreno/ir3/meson.build | 1 + src/freedreno/vulkan/tu_cmd_buffer.cc | 68 ++++++++++++------ src/freedreno/vulkan/tu_pipeline.cc | 19 +++-- src/freedreno/vulkan/tu_shader.cc | 2 + 21 files changed, 215 insertions(+), 48 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py 
b/src/compiler/nir/nir_intrinsics.py index 8bc2226ab69..4bc57a7613a 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1329,6 +1329,10 @@ store("uniform_ir3", [], indices=[BASE]) # vec4's. intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE]) +# IR3-specific intrinsic for stsc. Loads from push consts to constant file +# Should be used in the shader preamble. +intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE]) + # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index 5dda81a24bf..4f2af42a8d4 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -191,6 +191,10 @@ struct fd_dev_info { } a6xx; struct { + /* stsc may need to be done twice for the same range to workaround + * _something_, observed in blob's disassembly. 
+ */ + bool stsc_duplication_quirk; } a7xx; }; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index 04695d30cff..7e9a3d73bd0 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -704,7 +704,9 @@ add_gpus([ a7xx_730 = A7XXProps() -a7xx_740 = A7XXProps() +a7xx_740 = A7XXProps( + stsc_duplication_quirk = True, + ) add_gpus([ GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c index 489e45b038b..f842e7fdb1b 100644 --- a/src/freedreno/ir3/disasm-a3xx.c +++ b/src/freedreno/ir3/disasm-a3xx.c @@ -194,6 +194,7 @@ static const struct opc_info { OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro), OPC(1, OPC_SCAN_MACRO, scan.macro), OPC(1, OPC_SHPS_MACRO, shps.macro), + OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro), /* category 2: */ OPC(2, OPC_ADD_F, add.f), diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h index e9eaebc937c..777cfeb7113 100644 --- a/src/freedreno/ir3/instr-a3xx.h +++ b/src/freedreno/ir3/instr-a3xx.h @@ -131,6 +131,11 @@ typedef enum { /* Macros that expand to a loop */ OPC_SCAN_MACRO = _OPC(1, 58), + /* Macros that expand to an stsc at the start of the preamble. + * It loads into const file and should not be optimized in any way. 
+ */ + OPC_PUSH_CONSTS_LOAD_MACRO = _OPC(1, 59), + /* category 2: */ OPC_ADD_F = _OPC(2, 0), OPC_MIN_F = _OPC(2, 1), @@ -406,7 +411,7 @@ typedef enum { /* * A manually encoded opcode */ - OPC_META_RAW = _OPC(OPC_META, 7) + OPC_META_RAW = _OPC(OPC_META, 7), } opc_t; /* clang-format on */ diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 221080ba261..7eb8ca36209 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -445,6 +445,10 @@ struct ir3_instruction { */ gl_system_value sysval; } input; + struct { + unsigned src_base, src_size; + unsigned dst_base; + } push_consts; struct { uint64_t value; } raw; @@ -2485,6 +2489,7 @@ INSTR1(QUAD_SHUFFLE_VERT) INSTR1(QUAD_SHUFFLE_DIAG) INSTR2NODST(LDC_K) INSTR2NODST(STC) +INSTR2NODST(STSC) #ifndef GPU #elif GPU >= 600 INSTR3NODST(STIB); diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index eb0001a0210..52d04fdd7d9 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -202,13 +202,13 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id, compiler->shared_consts_size = 8; compiler->geom_shared_consts_size_quirk = 16; } else { - /* A7XX TODO: properly use new shared consts mechanism */ compiler->shared_consts_base_offset = -1; compiler->shared_consts_size = 0; compiler->geom_shared_consts_size_quirk = 0; } compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch; + compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk; } else { compiler->max_const_pipeline = 512; compiler->max_const_geom = 512; diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 892d3627004..d51ef5519cc 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -245,6 +245,8 @@ struct ir3_compiler { uint64_t geom_shared_consts_size_quirk; bool has_fs_tex_prefetch; + + bool stsc_duplication_quirk; }; void ir3_compiler_destroy(struct ir3_compiler 
*compiler); diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index b8cd9848f25..54cc6ed3202 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -2678,6 +2678,16 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) array_insert(b, b->keeps, stc); break; } + case nir_intrinsic_copy_push_const_to_uniform_ir3: { + struct ir3_instruction *load = + ir3_instr_create(ctx->block, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0); + array_insert(b, b->keeps, load); + + load->push_consts.dst_base = nir_src_as_uint(intr->src[0]); + load->push_consts.src_base = nir_intrinsic_base(intr); + load->push_consts.src_size = nir_intrinsic_range(intr); + break; + } default: ir3_context_error(ctx, "Unhandled intrinsic type: %s\n", nir_intrinsic_infos[intr->intrinsic].name); diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c index 0626113d7a0..604e5bf2962 100644 --- a/src/freedreno/ir3/ir3_legalize.c +++ b/src/freedreno/ir3/ir3_legalize.c @@ -58,6 +58,7 @@ struct ir3_legalize_state { regmask_t needs_ss; regmask_t needs_ss_war; /* write after read */ regmask_t needs_sy; + bool needs_ss_for_const; }; struct ir3_legalize_block_data { @@ -65,6 +66,17 @@ struct ir3_legalize_block_data { struct ir3_legalize_state state; }; +static inline void +apply_ss(struct ir3_instruction *instr, + struct ir3_legalize_state *state, + bool mergedregs) +{ + instr->flags |= IR3_INSTR_SS; + regmask_init(&state->needs_ss_war, mergedregs); + regmask_init(&state->needs_ss, mergedregs); + state->needs_ss_for_const = false; +} + /* We want to evaluate each block from the position of any other * predecessor block, in order that the flags set are the union of * all possible program paths. 
@@ -109,6 +121,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) regmask_or(&state->needs_ss_war, &state->needs_ss_war, &pstate->needs_ss_war); regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy); + state->needs_ss_for_const |= pstate->needs_ss_for_const; } /* We need to take phsyical-only edges into account when tracking shared @@ -162,17 +175,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) { - n->flags |= IR3_INSTR_SS | IR3_INSTR_SY; - last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); + apply_ss(n, state, mergedregs); + + n->flags |= IR3_INSTR_SY; regmask_init(&state->needs_sy, mergedregs); + last_input_needs_ss = false; } if (last_n && (last_n->opc == OPC_PREDT)) { - n->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); + apply_ss(n, state, mergedregs); } /* NOTE: consider dst register too.. it could happen that @@ -195,25 +206,24 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) * some tests for both this and (sy).. 
*/ if (regmask_get(&state->needs_ss, reg)) { - n->flags |= IR3_INSTR_SS; + apply_ss(n, state, mergedregs); last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); } if (regmask_get(&state->needs_sy, reg)) { n->flags |= IR3_INSTR_SY; regmask_init(&state->needs_sy, mergedregs); } + } else if ((reg->flags & IR3_REG_CONST) && state->needs_ss_for_const) { + apply_ss(n, state, mergedregs); + last_input_needs_ss = false; } } foreach_dst (reg, n) { if (regmask_get(&state->needs_ss_war, reg)) { - n->flags |= IR3_INSTR_SS; + apply_ss(n, state, mergedregs); last_input_needs_ss = false; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); } } @@ -230,7 +240,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } /* need to be able to set (ss) on first instruction: */ - if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5)) + if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n)) ir3_NOP(block); if (ctx->compiler->samgq_workaround && @@ -281,6 +291,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) } else { regmask_set(&state->needs_ss, n->dsts[0]); } + } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + state->needs_ss_for_const = true; } if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) || @@ -324,9 +336,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) last_input->dsts[0]->flags |= IR3_REG_EI; if (last_input_needs_ss) { - last_input->flags |= IR3_INSTR_SS; - regmask_init(&state->needs_ss_war, mergedregs); - regmask_init(&state->needs_ss, mergedregs); + apply_ss(last_input, state, mergedregs); } } } @@ -407,6 +417,36 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block) return true; } +static void +apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx, + struct ir3_block *block) +{ + foreach_instr (n, &block->instr_list) { + if 
(n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2); + ir3_instr_move_after(stsc, n); + ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val = + n->push_consts.dst_base; + ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val = + n->push_consts.src_base; + stsc->cat6.iim_val = n->push_consts.src_size; + stsc->cat6.type = TYPE_U32; + + if (ctx->compiler->stsc_duplication_quirk) { + struct ir3_instruction *nop = ir3_NOP(block); + ir3_instr_move_after(nop, stsc); + nop->flags |= IR3_INSTR_SS; + ir3_instr_move_after(ir3_instr_clone(stsc), nop); + } + + list_delinit(&n->node); + break; + } else if (!is_meta(n)) { + break; + } + } +} + /* NOTE: branch instructions are always the last instruction(s) * in the block. We take advantage of this as we resolve the * branches, since "if (foo) break;" constructs turn into @@ -1180,6 +1220,13 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) progress |= apply_fine_deriv_macro(ctx, block); } + foreach_block (block, &ir->block_list) { + if (block->brtype == IR3_BRANCH_GETONE) { + apply_push_consts_load_macro(ctx, block->successors[0]); + break; + } + } + nop_sched(ir, so); while (opt_jump(ir)) diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 2598064af7e..21b3576ec7b 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -747,6 +747,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) progress |= OPT(s, ir3_nir_lower_ubo_loads, so); + if (so->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) + progress |= OPT(s, ir3_nir_lower_push_consts_to_preamble, so); + progress |= OPT(s, ir3_nir_lower_preamble, so); OPT_V(s, nir_lower_amul, ir3_glsl_type_size); diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h index b8cf29ba1f4..de6b70e7877 100644 --- a/src/freedreno/ir3/ir3_nir.h +++ b/src/freedreno/ir3/ir3_nir.h @@ -40,6 +40,8 @@ bool 
ir3_nir_lower_imul(nir_shader *shader); bool ir3_nir_lower_io_offsets(nir_shader *shader); bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader); bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader); +bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir, + struct ir3_shader_variant *v); bool ir3_nir_move_varying_inputs(nir_shader *shader); int ir3_nir_coord_offset(nir_def *ssa); bool ir3_nir_lower_tex_prefetch(nir_shader *shader); diff --git a/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c b/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c new file mode 100644 index 00000000000..056edb70d83 --- /dev/null +++ b/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c @@ -0,0 +1,28 @@ +/* + * Copyright © 2023 Igalia S.L. + * SPDX-License-Identifier: MIT + */ + +#include "compiler/nir/nir.h" +#include "compiler/nir/nir_builder.h" +#include "util/u_math.h" +#include "ir3_compiler.h" +#include "ir3_nir.h" + +bool +ir3_nir_lower_push_consts_to_preamble(nir_shader *nir, + struct ir3_shader_variant *v) +{ + nir_function_impl *preamble = nir_shader_get_preamble(nir); + nir_builder _b = nir_builder_at(nir_before_impl(preamble)); + nir_builder *b = &_b; + + nir_copy_push_const_to_uniform_ir3( + b, nir_imm_int(b, 0), .base = v->shader_options.push_consts_base, + .range = v->shader_options.push_consts_dwords); + + nir_foreach_function_impl(impl, nir) { + nir_metadata_preserve(impl, nir_metadata_none); + } + return true; +} diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index 97c05e0e336..29bf3a64847 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -691,6 +691,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) if (instr->opc == OPC_META_TEX_PREFETCH) schedule(ctx, instr); + foreach_instr_safe (instr, &ctx->unscheduled_list) + if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO) + schedule(ctx, instr); + while 
(!list_is_empty(&ctx->unscheduled_list)) { struct ir3_instruction *instr = choose_instr(ctx); diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c index e85513d3ea9..b1d87545118 100644 --- a/src/freedreno/ir3/ir3_print.c +++ b/src/freedreno/ir3/ir3_print.c @@ -181,7 +181,8 @@ print_instr_name(struct log_stream *stream, struct ir3_instruction *instr, } } - if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO) { + if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO && + instr->opc != OPC_PUSH_CONSTS_LOAD_MACRO) { mesa_log_stream_printf(stream, ".%s%s", type_name(instr->cat1.src_type), type_name(instr->cat1.dst_type)); @@ -405,6 +406,11 @@ print_instr(struct log_stream *stream, struct ir3_instruction *instr, int lvl) mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d", instr->prefetch.tex, instr->prefetch.samp, instr->prefetch.input_offset); + } else if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO) { + mesa_log_stream_printf( + stream, " dst_offset=%d, src_offset = %d, src_size = %d", + instr->push_consts.dst_base, instr->push_consts.src_base, + instr->push_consts.src_size); } if (is_flow(instr) && instr->cat0.target) { diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index 814920d2d1f..31f709b52d8 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -1235,6 +1235,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) if (instr->opc == OPC_META_TEX_PREFETCH) schedule(ctx, instr); + foreach_instr_safe (instr, &ctx->unscheduled_list) + if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO) + schedule(ctx, instr); + while (!list_is_empty(&ctx->unscheduled_list)) { struct ir3_sched_notes notes = {0}; struct ir3_instruction *instr; diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 345d2af7104..d2686fcb104 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -151,6 +151,7 @@ enum ir3_push_consts_type 
{ IR3_PUSH_CONSTS_NONE, IR3_PUSH_CONSTS_PER_STAGE, IR3_PUSH_CONSTS_SHARED, + IR3_PUSH_CONSTS_SHARED_PREAMBLE, }; /** @@ -507,6 +508,9 @@ struct ir3_shader_options { */ enum ir3_wavesize_option real_wavesize; enum ir3_push_consts_type push_consts_type; + + uint32_t push_consts_base; + uint32_t push_consts_dwords; }; /** diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index 0e051958805..954b0b88fdb 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -95,6 +95,7 @@ libfreedreno_ir3_files = files( 'ir3_nir_lower_64b.c', 'ir3_nir_lower_load_barycentric_at_sample.c', 'ir3_nir_lower_load_barycentric_at_offset.c', + 'ir3_nir_lower_push_consts_to_preamble.c', 'ir3_nir_lower_io_offsets.c', 'ir3_nir_lower_tess.c', 'ir3_nir_lower_tex_prefetch.c', diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc index ea48e4b7ac1..e83c7d1ddba 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.cc +++ b/src/freedreno/vulkan/tu_cmd_buffer.cc @@ -4256,9 +4256,10 @@ tu6_user_consts_size(const struct tu_const_state *const_state, { uint32_t dwords = 0; - if (const_state->push_consts.dwords > 0) { + if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) { unsigned num_units = const_state->push_consts.dwords; dwords += 4 + num_units; + assert(num_units > 0); } dwords += 8 * const_state->num_inline_ubos; @@ -4267,12 +4268,10 @@ tu6_user_consts_size(const struct tu_const_state *const_state, } static void -tu6_emit_user_consts(struct tu_cs *cs, - const struct tu_const_state *const_state, - unsigned constlen, - gl_shader_stage type, - struct tu_descriptor_state *descriptors, - uint32_t *push_constants) +tu6_emit_per_stage_push_consts(struct tu_cs *cs, + const struct tu_const_state *const_state, + gl_shader_stage type, + uint32_t *push_constants) { if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) { unsigned num_units = const_state->push_consts.dwords; @@ -4291,7 +4290,15 @@ 
tu6_emit_user_consts(struct tu_cs *cs, for (unsigned i = 0; i < num_units; i++) tu_cs_emit(cs, push_constants[i + offset]); } +} +static void +tu6_emit_inline_ubo(struct tu_cs *cs, + const struct tu_const_state *const_state, + unsigned constlen, + gl_shader_stage type, + struct tu_descriptor_state *descriptors) +{ /* Emit loads of inline uniforms. These load directly from the uniform's * storage space inside the descriptor set. */ @@ -4349,6 +4356,18 @@ tu6_emit_shared_consts(struct tu_cs *cs, } } +static void +tu7_emit_shared_preamble_consts( + struct tu_cs *cs, + const struct tu_push_constant_range *shared_consts, + uint32_t *push_constants) +{ + tu_cs_emit_pkt4(cs, REG_A7XX_HLSQ_SHARED_CONSTS_IMM(shared_consts->lo), + shared_consts->dwords); + tu_cs_emit_array(cs, push_constants + shared_consts->lo, + shared_consts->dwords); +} + static uint32_t tu6_const_size(struct tu_cmd_buffer *cmd, const struct tu_push_constant_range *shared_consts, @@ -4358,6 +4377,8 @@ tu6_const_size(struct tu_cmd_buffer *cmd, if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) { dwords += shared_consts->dwords + 4; + } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) { + dwords += shared_consts->dwords + 1; } if (compute) { @@ -4372,8 +4393,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd, } static struct tu_draw_state -tu6_emit_consts(struct tu_cmd_buffer *cmd, - bool compute) +tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute) { uint32_t dwords = 0; const struct tu_push_constant_range *shared_consts = @@ -4390,24 +4410,30 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd, if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) { tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute); + } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) { + tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants); } if (compute) { - tu6_emit_user_consts(&cs, - &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, - 
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen, - MESA_SHADER_COMPUTE, - tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE), - cmd->push_constants); + tu6_emit_per_stage_push_consts( + &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, + MESA_SHADER_COMPUTE, cmd->push_constants); + tu6_emit_inline_ubo( + &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state, + cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen, + MESA_SHADER_COMPUTE, + tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE)); } else { - struct tu_descriptor_state *descriptors = + struct tu_descriptor_state *descriptors = tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; type++) { const struct tu_program_descriptor_linkage *link = &cmd->state.program.link[type]; - tu6_emit_user_consts(&cs, &link->tu_const_state, link->constlen, - (gl_shader_stage) type, - descriptors, cmd->push_constants); + tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state, + (gl_shader_stage) type, + cmd->push_constants); + tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen, + (gl_shader_stage) type, descriptors); } } @@ -4751,7 +4777,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd, } if (dirty & TU_CMD_DIRTY_SHADER_CONSTS) - cmd->state.shader_const = tu6_emit_consts(cmd, false); + cmd->state.shader_const = tu_emit_consts(cmd, false); if (dirty & TU_CMD_DIRTY_DESC_SETS) tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS); @@ -5502,7 +5528,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd, tu_emit_cache_flush<CHIP>(cmd); /* note: no reason to have this in a separate IB */ - tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, true)); + tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true)); tu_emit_compute_driver_params<CHIP>(cmd, cs, info); diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index 1725b82e0bb..342fd3b1e7a 100644 --- 
a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -318,7 +318,11 @@ tu_push_consts_type(const struct tu_pipeline_layout *layout, if (tu6_shared_constants_enable(layout, compiler)) { return IR3_PUSH_CONSTS_SHARED; } else { - return IR3_PUSH_CONSTS_PER_STAGE; + if (compiler->gen >= 7) { + return IR3_PUSH_CONSTS_SHARED_PREAMBLE; + } else { + return IR3_PUSH_CONSTS_PER_STAGE; + } } } @@ -385,7 +389,9 @@ tu6_emit_xs_config(struct tu_cs *cs, tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1); tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) | - A6XX_HLSQ_VS_CNTL_ENABLED); + A6XX_HLSQ_VS_CNTL_ENABLED | + COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE, + A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS)); } TU_GENX(tu6_emit_xs_config); @@ -2335,10 +2341,11 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder, &pipeline->shaders[i]->const_state, variants[i]); - if (pipeline->shaders[i]->const_state.push_consts.type == - IR3_PUSH_CONSTS_SHARED) { - pipeline->program.shared_consts = - pipeline->shaders[i]->const_state.push_consts; + struct tu_push_constant_range *push_consts = + &pipeline->shaders[i]->const_state.push_consts; + if (push_consts->type == IR3_PUSH_CONSTS_SHARED || + push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) { + pipeline->program.shared_consts = *push_consts; } } diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index ff6b4293e3b..6745da7cd24 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -2286,6 +2286,8 @@ tu_shader_create(struct tu_device *dev, .api_wavesize = key->api_wavesize, .real_wavesize = key->real_wavesize, .push_consts_type = shader->const_state.push_consts.type, + .push_consts_base = shader->const_state.push_consts.lo, + .push_consts_dwords = shader->const_state.push_consts.dwords, }; struct ir3_shader *ir3_shader =
