Module: Mesa Branch: main Commit: 853436bacd7512e5c57d2e660c225091d7b2a935 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=853436bacd7512e5c57d2e660c225091d7b2a935
Author: Qiang Yu <[email protected]> Date: Sun Jun 12 21:02:26 2022 +0800 radeonsi: replace llvm ngg gs with nir lowering Acked-by: Pierre-Eric Pelloux-Prayer <[email protected]> Reviewed-by: Marek Olšák <[email protected]> Signed-off-by: Qiang Yu <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17109> --- src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 31 ++++-------------------- src/gallium/drivers/radeonsi/si_shader.c | 16 +++++++++--- src/gallium/drivers/radeonsi/si_shader_llvm.c | 31 +++++++----------------- src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 4 +-- src/gallium/drivers/radeonsi/si_shader_nir.c | 12 +++++++-- 5 files changed, 39 insertions(+), 55 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index c2dc9d4fd6f..2c5826de964 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -2011,22 +2011,9 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, void gfx10_ngg_gs_emit_begin(struct si_shader_context *ctx) { - /* Zero out the part of LDS scratch that is used to accumulate the - * per-stream generated primitive count. - */ LLVMBuilderRef builder = ctx->ac.builder; - struct ac_llvm_pointer scratchptr = ctx->gs_ngg_scratch; - LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx); LLVMValueRef tmp; - tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, LLVMConstInt(ctx->ac.i32, 4, false), ""); - ac_build_ifcc(&ctx->ac, tmp, 5090); - { - LLVMValueRef ptr = ac_build_gep0(&ctx->ac, scratchptr, tid); - LLVMBuildStore(builder, ctx->ac.i32_0, ptr); - } - ac_build_endif(&ctx->ac, 5090); - if (ctx->screen->info.gfx_level < GFX11) { tmp = si_is_gs_thread(ctx); ac_build_ifcc(&ctx->ac, tmp, 15090); @@ -2049,9 +2036,6 @@ void gfx10_ngg_gs_emit_begin(struct si_shader_context *ctx) } ac_build_endif(&ctx->ac, 15090); } - - ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); - ac_build_s_barrier(&ctx->ac, ctx->stage); } void gfx10_ngg_gs_build_end(struct si_shader_context *ctx) @@ -2429,17 +2413,12 @@ static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader) { const struct si_shader_selector *sel = shader->selector; - bool uses_streamout = si_shader_uses_streamout(shader); - if (sel->stage == MESA_SHADER_GEOMETRY) { - return uses_streamout ? 44 : 8; - } else { - return ac_ngg_get_scratch_lds_size(sel->stage, - si_get_max_workgroup_size(shader), - shader->wave_size, - uses_streamout, - shader->key.ge.opt.ngg_culling) / 4; - } + return ac_ngg_get_scratch_lds_size(sel->stage, + si_get_max_workgroup_size(shader), + shader->wave_size, + si_shader_uses_streamout(shader), + shader->key.ge.opt.ngg_culling) / 4; } /** diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ef864365511..314ab3d6f91 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -235,7 +235,8 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0; case MESA_SHADER_GEOMETRY: - return shader->selector->screen->info.gfx_level >= GFX9 ? 128 : 0; + /* ngg_subgroup_size is only the input size. GS can always generate up to 256 vertices. */ + return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0; case MESA_SHADER_COMPUTE: break; /* see below */ @@ -1667,6 +1668,14 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) options.user_clip_plane_enable_mask = clip_plane_enable; NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options); + } else { + assert(nir->info.stage == MESA_SHADER_GEOMETRY); + + options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size; + options.has_gen_prim_query = options.has_xfb_prim_query = + sel->screen->use_ngg_streamout; + + NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options); } /* may generate some subgroup op like ballot */ @@ -1908,7 +1917,7 @@ struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir, si_assign_param_offsets(nir, shader); /* Only lower last VGT NGG shader stage. */ - if (sel->stage < MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) { + if (sel->stage <= MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) { si_lower_ngg(shader, nir); opt_offsets = true; } @@ -1960,7 +1969,8 @@ bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compi struct nir_shader *nir = si_get_nir_shader(shader, &free_nir, 0); struct pipe_stream_output_info so = {}; - if (si_shader_uses_streamout(shader)) + /* NGG streamout has been lowered to buffer store in nir. */ + if (!sscreen->use_ngg_streamout && si_shader_uses_streamout(shader)) nir_gather_stream_output_info(nir, &so); /* Dump NIR before doing NIR->LLVM conversion in case the diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 07d47691e16..dd5f5745c91 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -1007,32 +1007,22 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad case MESA_SHADER_GEOMETRY: si_llvm_init_gs_callbacks(ctx); - if (!ctx->shader->key.ge.as_ngg) - si_preload_gs_rings(ctx); - - for (unsigned i = 0; i < 4; i++) - ctx->gs_next_vertex[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - - if (shader->key.ge.as_ngg) { - for (unsigned i = 0; i < 4; ++i) { - ctx->gs_curprim_verts[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - ctx->gs_generated_prims[i] = ac_build_alloca(&ctx->ac, ctx->ac.i32, ""); - } - - assert(!ctx->gs_ngg_scratch.value); + if (ctx->shader->key.ge.as_ngg) { LLVMTypeRef ai32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, ai32, "ngg_scratch", AC_ADDR_SPACE_LDS), .pointee_type = ai32 }; LLVMSetInitializer(ctx->gs_ngg_scratch.value, LLVMGetUndef(ai32)); - LLVMSetAlignment(ctx->gs_ngg_scratch.value, 4); + LLVMSetAlignment(ctx->gs_ngg_scratch.value, 8); ctx->gs_ngg_emit = LLVMAddGlobalInAddressSpace( ctx->ac.module, LLVMArrayType(ctx->ac.i32, 0), "ngg_emit", AC_ADDR_SPACE_LDS); LLVMSetLinkage(ctx->gs_ngg_emit, LLVMExternalLinkage); LLVMSetAlignment(ctx->gs_ngg_emit, 4); } else { + si_preload_gs_rings(ctx); + ctx->gs_emitted_vertices = LLVMConstInt(ctx->ac.i32, 0, false); } break; @@ -1144,17 +1134,17 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) ac_build_s_barrier(&ctx->ac, ctx->stage); - /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */ + /* NGG GS: handle GS_STATE_PIPELINE_STATS_EMU */ if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) gfx10_ngg_gs_emit_begin(ctx); LLVMValueRef thread_enabled = NULL; - if (ctx->stage == MESA_SHADER_GEOMETRY || + if ((ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) || (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) { /* Wrap both shaders in an if statement according to the number of enabled threads * there. For monolithic TCS, the if statement is inserted by the wrapper function, - * not here. + * not here. For NGG GS, the if statement is inserted by nir lowering. */ thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */ } else if ((shader->key.ge.as_ls || shader->key.ge.as_es) && !shader->is_monolithic) { @@ -1200,8 +1190,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->ac.wave_size % sel->info.base.tess.tcs_vertices_out != 0) ac_build_s_barrier(&ctx->ac, ctx->stage); } - } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) { - /* gfx10_ngg_gs_emit_begin inserts the barrier for NGG. */ + } else if (ctx->stage == MESA_SHADER_GEOMETRY) { ac_build_waitcnt(&ctx->ac, AC_WAIT_LGKM); ac_build_s_barrier(&ctx->ac, ctx->stage); } @@ -1260,9 +1249,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad break; case MESA_SHADER_GEOMETRY: - if (ctx->shader->key.ge.as_ngg) - gfx10_ngg_gs_build_end(ctx); - else + if (!ctx->shader->key.ge.as_ngg) si_llvm_gs_build_end(ctx); break; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 12d45448974..597ea18eb95 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -699,8 +699,8 @@ void si_llvm_build_vs_exports(struct si_shader_context *ctx, LLVMValueRef num_ex } if (!shader->info.nr_param_exports || - /* GFX11 VS/TES param export is handled in nir */ - (ctx->screen->info.gfx_level >= GFX11 && ctx->stage != MESA_SHADER_GEOMETRY)) + /* GFX11 param export is handled in nir */ + ctx->screen->info.gfx_level >= GFX11) return; /* Build parameter exports. Use 2 loops to export params in ascending order. diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 413dce3cfa4..c704bfd312f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -298,8 +298,16 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) nir->info.stage == MESA_SHADER_GEOMETRY) NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out); - if (nir->info.stage == MESA_SHADER_GEOMETRY) - NIR_PASS_V(nir, nir_lower_gs_intrinsics, nir_lower_gs_intrinsics_per_stream); + if (nir->info.stage == MESA_SHADER_GEOMETRY) { + unsigned flags = nir_lower_gs_intrinsics_per_stream; + if (sscreen->use_ngg) { + flags |= nir_lower_gs_intrinsics_count_primitives | + nir_lower_gs_intrinsics_count_vertices_per_primitive | + nir_lower_gs_intrinsics_overwrite_incomplete; + } + + NIR_PASS_V(nir, nir_lower_gs_intrinsics, flags); + } if (nir->info.stage == MESA_SHADER_COMPUTE) { if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
