From: Marek Olšák <marek.ol...@amd.com>

TGSI compute shaders don't have RW_BUFFERS, so the scratch pointer is passed to them in SGPR[0:1]. Graphics shaders instead take the scratch buffer from the first slot of RW_BUFFERS (SI_SCRATCH_BUFFER).
TODO: Dave's patch only implements the latter; fix the attribute names. UNTESTED --- src/gallium/drivers/radeonsi/si_compute.c | 27 +++++-- src/gallium/drivers/radeonsi/si_shader.c | 34 +++++--- src/gallium/drivers/radeonsi/si_shader.h | 1 + src/gallium/drivers/radeonsi/si_state.h | 1 + src/gallium/drivers/radeonsi/si_state_draw.c | 8 ++ src/gallium/drivers/radeonsi/si_state_shaders.c | 102 +++++++++++++----------- 6 files changed, 111 insertions(+), 62 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 9d83cb3..8a4c02e 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -287,21 +287,23 @@ static bool si_setup_compute_scratch_buffer(struct si_context *sctx, r600_resource_reference(&sctx->compute_scratch_buffer, NULL); sctx->compute_scratch_buffer = (struct r600_resource*) pipe_buffer_create(&sctx->screen->b.b, 0, PIPE_USAGE_DEFAULT, scratch_needed); if (!sctx->compute_scratch_buffer) return false; } - if (sctx->compute_scratch_buffer != shader->scratch_bo && scratch_needed) { + if (HAVE_LLVM <= 0x0309 && + scratch_needed && + sctx->compute_scratch_buffer != shader->scratch_bo) { uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; si_shader_apply_scratch_relocs(sctx, shader, config, scratch_va); if (si_shader_binary_upload(sctx->screen, shader)) return false; r600_resource_reference(&shader->scratch_bo, sctx->compute_scratch_buffer); } @@ -351,30 +353,43 @@ static bool si_switch_compute_shader(struct si_context *sctx, /* TODO: use si_multiwave_lds_size_workaround */ assert(lds_blocks <= 0xFF); config->rsrc2 &= C_00B84C_LDS_SIZE; config->rsrc2 |= S_00B84C_LDS_SIZE(lds_blocks); } if (!si_setup_compute_scratch_buffer(sctx, shader, config)) return false; - if (shader->scratch_bo) { + if (config->scratch_bytes_per_wave) { COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u bytes; " "Total Scratch: %u bytes\n", sctx->scratch_waves, 
config->scratch_bytes_per_wave, config->scratch_bytes_per_wave * sctx->scratch_waves); radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - shader->scratch_bo, RADEON_USAGE_READWRITE, - RADEON_PRIO_SCRATCH_BUFFER); + sctx->compute_scratch_buffer, + RADEON_USAGE_READWRITE, + RADEON_PRIO_SCRATCH_BUFFER); + + /* Write the scratch pointer to SGPR[0:1]. */ + if (HAVE_LLVM >= 0x0400 && + program->ir_type == PIPE_SHADER_IR_TGSI) { + uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address; + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); + radeon_emit(cs, scratch_va); + radeon_emit(cs, + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) | + S_008F04_SWIZZLE_ENABLE(1)); + } } shader_va = shader->bo->gpu_address + offset; if (program->use_code_object_v2) { /* Shader code is placed after the amd_kernel_code_t * struct. */ shader_va += sizeof(amd_kernel_code_t); } radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo, @@ -729,21 +744,23 @@ static void si_launch_grid( si_upload_compute_shader_descriptors(sctx); si_emit_compute_shader_userdata(sctx); if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) { sctx->atoms.s.render_cond->emit(&sctx->b, sctx->atoms.s.render_cond); si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false); } - if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE) + if (program->ir_type == PIPE_SHADER_IR_TGSI) + assert(program->input_size == 0); + else if (program->ir_type == PIPE_SHADER_IR_NATIVE) si_upload_compute_input(sctx, code_object, info); /* Global buffers */ for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) { struct r600_resource *buffer = (struct r600_resource*)program->global_buffers[i]; if (!buffer) { continue; } radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ed8eff4..507a44d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5321,20 +5321,28 @@ 
static void si_create_function(struct si_shader_context *ctx, LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-infs-fp-math", "true"); LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-nans-fp-math", "true"); LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "unsafe-fp-math", "true"); } + + if (ctx->type == PIPE_SHADER_COMPUTE) { + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, + "amdgpu-spill-bufsgpr01", "true"); + } else { + LLVMAddTargetDependentFunctionAttr(ctx->main_fn, + "amdgpu-spill-bufsgpr01-load", "true"); + } } static void create_meta_data(struct si_shader_context *ctx) { struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm; ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context, "invariant.load", 14); ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context, "range", 5); @@ -5762,32 +5770,36 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0); } void si_shader_binary_read_config(struct radeon_shader_binary *binary, struct si_shader_config *conf, unsigned symbol_offset) { unsigned i; const unsigned char *config = radeon_shader_binary_config_start(binary, symbol_offset); - bool really_needs_scratch = false; + bool may_need_scratch = true; - /* LLVM adds SGPR spills to the scratch size. - * Find out if we really need the scratch buffer. - */ - for (i = 0; i < binary->reloc_count; i++) { - const struct radeon_shader_reloc *reloc = &binary->relocs[i]; + if (HAVE_LLVM <= 0x0309) { + /* LLVM adds SGPR spills to the scratch size. + * Find out if we really need the scratch buffer. 
+ */ + may_need_scratch = false; - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { - really_needs_scratch = true; - break; + for (i = 0; i < binary->reloc_count; i++) { + const struct radeon_shader_reloc *reloc = &binary->relocs[i]; + + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || + !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { + may_need_scratch = true; + break; + } } } /* XXX: We may be able to emit some of these values directly rather than * extracting fields to be emitted later. */ for (i = 0; i < binary->config_size_per_symbol; i+= 8) { unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4)); @@ -5810,21 +5822,21 @@ void si_shader_binary_read_config(struct radeon_shader_binary *binary, break; case R_0286CC_SPI_PS_INPUT_ENA: conf->spi_ps_input_ena = value; break; case R_0286D0_SPI_PS_INPUT_ADDR: conf->spi_ps_input_addr = value; break; case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. 
*/ - if (really_needs_scratch) + if (may_need_scratch) conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(value) * 256 * 4; break; case 0x4: /* SPILLED_SGPRS */ conf->spilled_sgprs = value; break; case 0x8: /* SPILLED_VGPRS */ conf->spilled_vgprs = value; break; default: diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 129e571..b30f61b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -483,20 +483,21 @@ struct si_shader_info { struct si_shader { struct si_shader_selector *selector; struct si_shader *next_variant; struct si_shader_part *prolog; struct si_shader_part *epilog; struct si_pm4_state *pm4; struct r600_resource *bo; + /* for tracking which scratch address the binary contains (<= LLVM 3.9) */ struct r600_resource *scratch_bo; struct si_shader_key key; struct util_queue_fence optimized_ready; bool compilation_failed; bool is_monolithic; bool is_optimized; bool is_binary_shared; bool is_gs_copy_shader; /* The following data is all that's needed for binary shaders. */ diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index d8e6024..b6b089a 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -154,20 +154,21 @@ union si_state_atoms { #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*)) struct si_shader_data { struct r600_atom atom; uint32_t sh_base[SI_NUM_SHADERS]; }; /* Private read-write buffer slots. 
*/ enum { + SI_SCRATCH_BUFFER, SI_HS_RING_TESS_FACTOR, SI_HS_RING_TESS_OFFCHIP, SI_ES_RING_ESGS, SI_GS_RING_ESGS, SI_GS_RING_GSVS0, SI_GS_RING_GSVS1, SI_GS_RING_GSVS2, SI_GS_RING_GSVS3, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cae19dc..e447e32 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1016,20 +1016,28 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; sctx->do_update_shaders = true; } } if (sctx->do_update_shaders && !si_update_shaders(sctx)) return; + /* Do it after si_update_shaders, but before + * si_upload_graphics_shader_descriptors. */ + if (HAVE_LLVM >= 0x0400 && sctx->emit_scratch_reloc) { + si_set_ring_buffer(ctx, SI_SCRATCH_BUFFER, + &sctx->scratch_buffer->b.b, + 0, 0xffffffff, true, true, 4, 64, 0); + } + if (!si_upload_graphics_shader_descriptors(sctx)) return; if (info->indexed) { /* Initialize the index buffer struct. 
*/ pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer); ib.user_buffer = sctx->index_buffer.user_buffer; ib.index_size = sctx->index_buffer.index_size; ib.offset = sctx->index_buffer.offset; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 0afc3b4..bb9f3a8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2130,90 +2130,100 @@ static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx) unsigned bytes = 0; bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current)); bytes = MAX2(bytes, si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current)); return bytes; } +static bool si_update_scratch_relocs(struct si_context *sctx) +{ + int r; + + /* Update the shaders, so they are using the latest scratch. The + * scratch buffer may have been changed since these shaders were + * last used, so we still need to try to update them, even if + * they require scratch buffers smaller than the current size. + */ + r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); + + r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current); + if (r < 0) + return false; + if (r == 1) + si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); + + /* VS can be bound as LS, ES, or VS. 
*/ + r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->tes_shader.current) + si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); + else if (sctx->gs_shader.current) + si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); + } + + /* TES can be bound as ES or VS. */ + r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); + if (r < 0) + return false; + if (r == 1) { + if (sctx->gs_shader.current) + si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); + else + si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); + } + + return true; +} + static bool si_update_spi_tmpring_size(struct si_context *sctx) { unsigned current_scratch_buffer_size = si_get_current_scratch_buffer_size(sctx); unsigned scratch_bytes_per_wave = si_get_max_scratch_bytes_per_wave(sctx); unsigned scratch_needed_size = scratch_bytes_per_wave * sctx->scratch_waves; unsigned spi_tmpring_size; - int r; if (scratch_needed_size > 0) { if (scratch_needed_size > current_scratch_buffer_size) { /* Create a bigger scratch buffer */ r600_resource_reference(&sctx->scratch_buffer, NULL); sctx->scratch_buffer = (struct r600_resource*) pipe_buffer_create(&sctx->screen->b.b, 0, PIPE_USAGE_DEFAULT, scratch_needed_size); if (!sctx->scratch_buffer) return false; sctx->emit_scratch_reloc = true; } - /* Update the shaders, so they are using the latest scratch. The - * scratch buffer may have been changed since these shaders were - * last used, so we still need to try to update them, even if - * they require scratch buffers smaller than the current size. 
- */ - r = si_update_scratch_buffer(sctx, sctx->ps_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->gs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4); - - r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current); - if (r < 0) - return false; - if (r == 1) - si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4); - - /* VS can be bound as LS, ES, or VS. */ - r = si_update_scratch_buffer(sctx, sctx->vs_shader.current); - if (r < 0) - return false; - if (r == 1) { - if (sctx->tes_shader.current) - si_pm4_bind_state(sctx, ls, sctx->vs_shader.current->pm4); - else if (sctx->gs_shader.current) - si_pm4_bind_state(sctx, es, sctx->vs_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->vs_shader.current->pm4); - } - - /* TES can be bound as ES or VS. */ - r = si_update_scratch_buffer(sctx, sctx->tes_shader.current); - if (r < 0) + if (HAVE_LLVM <= 0x0309 && + !si_update_scratch_relocs(sctx)) return false; - if (r == 1) { - if (sctx->gs_shader.current) - si_pm4_bind_state(sctx, es, sctx->tes_shader.current->pm4); - else - si_pm4_bind_state(sctx, vs, sctx->tes_shader.current->pm4); - } } /* The LLVM shader backend should be reporting aligned scratch_sizes. */ assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && "scratch size should already be aligned correctly."); spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) | S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10); if (spi_tmpring_size != sctx->spi_tmpring_size) { sctx->spi_tmpring_size = spi_tmpring_size; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev