On Fri, May 3, 2019 at 7:19 AM Nicolai Hähnle <[email protected]> wrote:
> From: Nicolai Hähnle <[email protected]> > > --- > src/amd/common/ac_binary.c | 2 + > src/gallium/drivers/radeonsi/si_compute.c | 14 +-- > src/gallium/drivers/radeonsi/si_shader.c | 112 +++------------------- > src/gallium/drivers/radeonsi/si_shader.h | 25 +---- > 4 files changed, 27 insertions(+), 126 deletions(-) > > diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c > index 44251886b5f..d0ca55e0e0d 100644 > --- a/src/amd/common/ac_binary.c > +++ b/src/amd/common/ac_binary.c > @@ -218,26 +218,28 @@ void ac_parse_shader_binary_config(const char *data, > size_t nbytes, > unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + > 4)); > switch (reg) { > case R_00B028_SPI_SHADER_PGM_RSRC1_PS: > case R_00B128_SPI_SHADER_PGM_RSRC1_VS: > case R_00B228_SPI_SHADER_PGM_RSRC1_GS: > case R_00B848_COMPUTE_PGM_RSRC1: > case R_00B428_SPI_SHADER_PGM_RSRC1_HS: > conf->num_sgprs = MAX2(conf->num_sgprs, > (G_00B028_SGPRS(value) + 1) * 8); > conf->num_vgprs = MAX2(conf->num_vgprs, > (G_00B028_VGPRS(value) + 1) * 4); > conf->float_mode = G_00B028_FLOAT_MODE(value); > + conf->rsrc1 = value; > break; > case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: > conf->lds_size = MAX2(conf->lds_size, > G_00B02C_EXTRA_LDS_SIZE(value)); > break; > case R_00B84C_COMPUTE_PGM_RSRC2: > conf->lds_size = MAX2(conf->lds_size, > G_00B84C_LDS_SIZE(value)); > + conf->rsrc2 = value; > break; > case R_0286CC_SPI_PS_INPUT_ENA: > conf->spi_ps_input_ena = value; > break; > case R_0286D0_SPI_PS_INPUT_ADDR: > conf->spi_ps_input_addr = value; > break; > case R_0286E8_SPI_TMPRING_SIZE: > case R_00B860_COMPUTE_TMPRING_SIZE: > /* WAVESIZE is in units of 256 dwords. 
*/ > diff --git a/src/gallium/drivers/radeonsi/si_compute.c > b/src/gallium/drivers/radeonsi/si_compute.c > index 541d7e6f118..02d7bac406a 100644 > --- a/src/gallium/drivers/radeonsi/si_compute.c > +++ b/src/gallium/drivers/radeonsi/si_compute.c > @@ -59,21 +59,21 @@ static const amd_kernel_code_t > *si_compute_get_code_object( > uint64_t symbol_offset) > { > if (!program->use_code_object_v2) { > return NULL; > } > return (const amd_kernel_code_t*) > (program->shader.binary.code + symbol_offset); > } > > static void code_object_to_config(const amd_kernel_code_t *code_object, > - struct si_shader_config *out_config) { > + struct ac_shader_config *out_config) { > > uint32_t rsrc1 = code_object->compute_pgm_resource_registers; > uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> 32; > out_config->num_sgprs = code_object->wavefront_sgpr_count; > out_config->num_vgprs = code_object->workitem_vgpr_count; > out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); > out_config->rsrc1 = rsrc1; > out_config->lds_size = MAX2(out_config->lds_size, > G_00B84C_LDS_SIZE(rsrc2)); > out_config->rsrc2 = rsrc2; > out_config->scratch_bytes_per_wave = > @@ -241,22 +241,22 @@ static void *si_create_compute_state( > const amd_kernel_code_t *code_object = > si_compute_get_code_object(program, 0); > code_object_to_config(code_object, > &program->shader.config); > if (program->shader.binary.reloc_count != 0) { > fprintf(stderr, "Error: %d unsupported > relocations\n", > > program->shader.binary.reloc_count); > FREE(program); > return NULL; > } > } else { > - > si_shader_binary_read_config(&program->shader.binary, > - &program->shader.config, 0); > + > ac_shader_binary_read_config(&program->shader.binary, > + &program->shader.config, 0, false); > } > si_shader_dump(sctx->screen, &program->shader, > &sctx->debug, > PIPE_SHADER_COMPUTE, stderr, true); > if (si_shader_binary_upload(sctx->screen, > &program->shader) < 0) { > fprintf(stderr, "LLVM failed to upload shader\n"); > 
FREE(program); > return NULL; > } > } > > @@ -362,21 +362,21 @@ static void si_initialize_compute(struct si_context > *sctx) > bc_va >> 8); > } > } > > sctx->cs_shader_state.emitted_program = NULL; > sctx->cs_shader_state.initialized = true; > } > > static bool si_setup_compute_scratch_buffer(struct si_context *sctx, > struct si_shader *shader, > - struct si_shader_config > *config) > + struct ac_shader_config > *config) > { > uint64_t scratch_bo_size, scratch_needed; > scratch_bo_size = 0; > scratch_needed = config->scratch_bytes_per_wave * > sctx->scratch_waves; > if (sctx->compute_scratch_buffer) > scratch_bo_size = sctx->compute_scratch_buffer->b.b.width0; > > if (scratch_bo_size < scratch_needed) { > si_resource_reference(&sctx->compute_scratch_buffer, NULL); > > @@ -405,38 +405,38 @@ static bool si_setup_compute_scratch_buffer(struct > si_context *sctx, > return true; > } > > static bool si_switch_compute_shader(struct si_context *sctx, > struct si_compute *program, > struct si_shader *shader, > const amd_kernel_code_t *code_object, > unsigned offset) > { > struct radeon_cmdbuf *cs = sctx->gfx_cs; > - struct si_shader_config inline_config = {0}; > - struct si_shader_config *config; > + struct ac_shader_config inline_config = {0}; > + struct ac_shader_config *config; > uint64_t shader_va; > > if (sctx->cs_shader_state.emitted_program == program && > sctx->cs_shader_state.offset == offset) > return true; > > if (program->ir_type != PIPE_SHADER_IR_NATIVE) { > config = &shader->config; > } else { > unsigned lds_blocks; > > config = &inline_config; > if (code_object) { > code_object_to_config(code_object, config); > } else { > - si_shader_binary_read_config(&shader->binary, > config, offset); > + ac_shader_binary_read_config(&shader->binary, > config, offset, false); > } > > lds_blocks = config->lds_size; > /* XXX: We are over allocating LDS. 
For SI, the shader > reports > * LDS in blocks of 256 bytes, so if there are 4 bytes lds > * allocated in the shader and 4 bytes allocated by the > state > * tracker, then we will set LDS_SIZE to 512 bytes rather > than 256. > */ > if (sctx->chip_class <= SI) { > lds_blocks += align(program->local_size, 256) >> 8; > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index f6d882cf583..da43447013d 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -4962,104 +4962,20 @@ static void si_llvm_emit_polygon_stipple(struct > si_shader_context *ctx, > /* The stipple pattern is 32x32, each row has 32 bits. */ > offset = LLVMBuildMul(builder, address[1], > LLVMConstInt(ctx->i32, 4, 0), ""); > row = buffer_load_const(ctx, desc, offset); > row = ac_to_integer(&ctx->ac, row); > bit = LLVMBuildLShr(builder, row, address[0], ""); > bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); > ac_build_kill_if_false(&ctx->ac, bit); > } > > -void si_shader_binary_read_config(struct ac_shader_binary *binary, > - struct si_shader_config *conf, > - unsigned symbol_offset) > -{ > - unsigned i; > - const unsigned char *config = > - ac_shader_binary_config_start(binary, symbol_offset); > - bool really_needs_scratch = false; > - > - /* LLVM adds SGPR spills to the scratch size. > - * Find out if we really need the scratch buffer. > - */ > - for (i = 0; i < binary->reloc_count; i++) { > - const struct ac_shader_reloc *reloc = &binary->relocs[i]; > - > - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || > - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { > - really_needs_scratch = true; > - break; > - } > - } > - > - /* XXX: We may be able to emit some of these values directly > rather than > - * extracting fields to be emitted later. 
> - */ > - > - for (i = 0; i < binary->config_size_per_symbol; i+= 8) { > - unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); > - unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i > + 4)); > - switch (reg) { > - case R_00B028_SPI_SHADER_PGM_RSRC1_PS: > - case R_00B128_SPI_SHADER_PGM_RSRC1_VS: > - case R_00B228_SPI_SHADER_PGM_RSRC1_GS: > - case R_00B428_SPI_SHADER_PGM_RSRC1_HS: > - case R_00B848_COMPUTE_PGM_RSRC1: > - conf->num_sgprs = MAX2(conf->num_sgprs, > (G_00B028_SGPRS(value) + 1) * 8); > - conf->num_vgprs = MAX2(conf->num_vgprs, > (G_00B028_VGPRS(value) + 1) * 4); > - conf->float_mode = G_00B028_FLOAT_MODE(value); > - conf->rsrc1 = value; > - break; > - case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: > - conf->lds_size = MAX2(conf->lds_size, > G_00B02C_EXTRA_LDS_SIZE(value)); > - break; > - case R_00B84C_COMPUTE_PGM_RSRC2: > - conf->lds_size = MAX2(conf->lds_size, > G_00B84C_LDS_SIZE(value)); > - conf->rsrc2 = value; > - break; > - case R_0286CC_SPI_PS_INPUT_ENA: > - conf->spi_ps_input_ena = value; > - break; > - case R_0286D0_SPI_PS_INPUT_ADDR: > - conf->spi_ps_input_addr = value; > - break; > - case R_0286E8_SPI_TMPRING_SIZE: > - case R_00B860_COMPUTE_TMPRING_SIZE: > - /* WAVESIZE is in units of 256 dwords. 
*/ > - if (really_needs_scratch) > - conf->scratch_bytes_per_wave = > - G_00B860_WAVESIZE(value) * 256 * 4; > - break; > - case 0x4: /* SPILLED_SGPRS */ > - conf->spilled_sgprs = value; > - break; > - case 0x8: /* SPILLED_VGPRS */ > - conf->spilled_vgprs = value; > - break; > - default: > - { > - static bool printed; > - > - if (!printed) { > - fprintf(stderr, "Warning: LLVM > emitted unknown " > - "config register: 0x%x\n", > reg); > - printed = true; > - } > - } > - break; > - } > - } > - > - if (!conf->spi_ps_input_addr) > - conf->spi_ps_input_addr = conf->spi_ps_input_ena; > -} > - > void si_shader_apply_scratch_relocs(struct si_shader *shader, > uint64_t scratch_va) > { > unsigned i; > uint32_t scratch_rsrc_dword0 = scratch_va; > uint32_t scratch_rsrc_dword1 = > S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); > > /* Enable scratch coalescing. */ > scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); > @@ -5213,21 +5129,21 @@ static void si_shader_dump_disassembly(const > struct ac_shader_binary *binary, > fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, > binary->code[i + 3], binary->code[i + 2], > binary->code[i + 1], binary->code[i]); > } > } > } > > static void si_calculate_max_simd_waves(struct si_shader *shader) > { > struct si_screen *sscreen = shader->selector->screen; > - struct si_shader_config *conf = &shader->config; > + struct ac_shader_config *conf = &shader->config; > unsigned num_inputs = shader->selector->info.num_inputs; > unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : > 256; > unsigned lds_per_wave = 0; > unsigned max_simd_waves; > > max_simd_waves = ac_get_max_simd_waves(sscreen->info.family); > > /* Compute LDS usage for PS. 
*/ > switch (shader->selector->type) { > case PIPE_SHADER_FRAGMENT: > @@ -5262,46 +5178,46 @@ static void si_calculate_max_simd_waves(struct > si_shader *shader) > } > > if (conf->num_vgprs) > max_simd_waves = MIN2(max_simd_waves, 256 / > conf->num_vgprs); > > /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage > above > * 16KB makes some SIMDs unoccupied). */ > if (lds_per_wave) > max_simd_waves = MIN2(max_simd_waves, 16384 / > lds_per_wave); > > - conf->max_simd_waves = max_simd_waves; > + shader->max_simd_waves = max_simd_waves; > } > > void si_shader_dump_stats_for_shader_db(const struct si_shader *shader, > struct pipe_debug_callback *debug) > { > - const struct si_shader_config *conf = &shader->config; > + const struct ac_shader_config *conf = &shader->config; > > pipe_debug_message(debug, SHADER_INFO, > "Shader Stats: SGPRS: %d VGPRS: %d Code Size: > %d " > "LDS: %d Scratch: %d Max Waves: %d Spilled > SGPRs: %d " > "Spilled VGPRs: %d PrivMem VGPRs: %d", > conf->num_sgprs, conf->num_vgprs, > si_get_shader_binary_size(shader), > conf->lds_size, conf->scratch_bytes_per_wave, > - conf->max_simd_waves, conf->spilled_sgprs, > - conf->spilled_vgprs, conf->private_mem_vgprs); > + shader->max_simd_waves, conf->spilled_sgprs, > + conf->spilled_vgprs, shader->private_mem_vgprs); > } > > static void si_shader_dump_stats(struct si_screen *sscreen, > const struct si_shader *shader, > unsigned processor, > FILE *file, > bool check_debug_option) > { > - const struct si_shader_config *conf = &shader->config; > + const struct ac_shader_config *conf = &shader->config; > > if (!check_debug_option || > si_can_dump_shader(sscreen, processor)) { > if (processor == PIPE_SHADER_FRAGMENT) { > fprintf(file, "*** SHADER CONFIG ***\n" > "SPI_PS_INPUT_ADDR = 0x%04x\n" > "SPI_PS_INPUT_ENA = 0x%04x\n", > conf->spi_ps_input_addr, > conf->spi_ps_input_ena); > } > > @@ -5311,24 +5227,24 @@ static void si_shader_dump_stats(struct si_screen > *sscreen, > "Spilled SGPRs: %d\n" > 
"Spilled VGPRs: %d\n" > "Private memory VGPRs: %d\n" > "Code Size: %d bytes\n" > "LDS: %d blocks\n" > "Scratch: %d bytes per wave\n" > "Max Waves: %d\n" > "********************\n\n\n", > conf->num_sgprs, conf->num_vgprs, > conf->spilled_sgprs, conf->spilled_vgprs, > - conf->private_mem_vgprs, > + shader->private_mem_vgprs, > si_get_shader_binary_size(shader), > conf->lds_size, conf->scratch_bytes_per_wave, > - conf->max_simd_waves); > + shader->max_simd_waves); > } > } > > const char *si_get_shader_name(const struct si_shader *shader, unsigned > processor) > { > switch (processor) { > case PIPE_SHADER_VERTEX: > if (shader->key.as_es) > return "Vertex Shader as ES"; > else if (shader->key.as_ls) > @@ -5399,21 +5315,21 @@ void si_shader_dump(struct si_screen *sscreen, > const struct si_shader *shader, > debug, "epilog", file); > fprintf(file, "\n"); > } > > si_shader_dump_stats(sscreen, shader, processor, file, > check_debug_option); > } > > static int si_compile_llvm(struct si_screen *sscreen, > struct ac_shader_binary *binary, > - struct si_shader_config *conf, > + struct ac_shader_config *conf, > struct ac_llvm_compiler *compiler, > LLVMModuleRef mod, > struct pipe_debug_callback *debug, > unsigned processor, > const char *name, > bool less_optimized) > { > int r = 0; > unsigned count = p_atomic_inc_return(&sscreen->num_compilations); > > @@ -5433,21 +5349,21 @@ static int si_compile_llvm(struct si_screen > *sscreen, > LLVMDisposeMessage(ir); > } > > if (!si_replace_shader(count, binary)) { > r = si_llvm_compile(mod, binary, compiler, debug, > less_optimized); > if (r) > return r; > } > > - si_shader_binary_read_config(binary, conf, 0); > + ac_shader_binary_read_config(binary, conf, 0, false); > > /* Enable 64-bit and 16-bit denormals, because there is no > performance > * cost. > * > * If denormals are enabled, all floating-point output modifiers > are > * ignored. 
> * > * Don't enable denormals for 32-bit floats, because: > * - Floating-point output modifiers would be ignored by the hw. > * - Some opcodes don't support denormals, such as v_mad_f32. We > would > @@ -6799,21 +6715,21 @@ int si_compile_tgsi_shader(struct si_screen > *sscreen, > need_prolog ? 1 : 0, 0); > } > > si_llvm_optimize_module(&ctx); > > /* Post-optimization transformations and analysis. */ > si_optimize_vs_outputs(&ctx); > > if ((debug && debug->debug_message) || > si_can_dump_shader(sscreen, ctx.type)) { > - ctx.shader->config.private_mem_vgprs = > + ctx.shader->private_mem_vgprs = > ac_count_scratch_private_memory(ctx.main_fn); > } > > /* Make sure the input is a pointer and not integer followed by > inttoptr. */ > assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == > LLVMPointerTypeKind); > > /* Compile to bytecode. */ > r = si_compile_llvm(sscreen, &shader->binary, &shader->config, > compiler, > ctx.ac.module, debug, ctx.type, > @@ -7954,23 +7870,23 @@ int si_shader_create(struct si_screen *sscreen, > struct ac_llvm_compiler *compile > shader->config.num_sgprs = > MAX2(shader->config.num_sgprs, > > shader->previous_stage->config.num_sgprs); > shader->config.num_vgprs = > MAX2(shader->config.num_vgprs, > > shader->previous_stage->config.num_vgprs); > shader->config.spilled_sgprs = > MAX2(shader->config.spilled_sgprs, > > shader->previous_stage->config.spilled_sgprs); > shader->config.spilled_vgprs = > MAX2(shader->config.spilled_vgprs, > > shader->previous_stage->config.spilled_vgprs); > - shader->config.private_mem_vgprs = > - MAX2(shader->config.private_mem_vgprs, > - > shader->previous_stage->config.private_mem_vgprs); > + shader->private_mem_vgprs = > + MAX2(shader->private_mem_vgprs, > + > shader->previous_stage->private_mem_vgprs); > shader->config.scratch_bytes_per_wave = > MAX2(shader->config.scratch_bytes_per_wave, > > shader->previous_stage->config.scratch_bytes_per_wave); > shader->info.uses_instanceid |= > > 
shader->previous_stage->info.uses_instanceid; > } > if (shader->prolog2) { > shader->config.num_sgprs = > MAX2(shader->config.num_sgprs, > > shader->prolog2->config.num_sgprs); > shader->config.num_vgprs = > MAX2(shader->config.num_vgprs, > diff --git a/src/gallium/drivers/radeonsi/si_shader.h > b/src/gallium/drivers/radeonsi/si_shader.h > index ecf7f8bbd7a..6c8f70dc94b 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.h > +++ b/src/gallium/drivers/radeonsi/si_shader.h > @@ -552,36 +552,20 @@ struct si_shader_key { > * but forces monolithic shaders to be used as soon as > * possible, because it's in the "opt" group. > */ > unsigned prefer_mono:1; > } opt; > }; > > /* Restore the pack alignment to default. */ > #pragma pack(pop) > > -struct si_shader_config { > - unsigned num_sgprs; > - unsigned num_vgprs; > - unsigned spilled_sgprs; > - unsigned spilled_vgprs; > - unsigned private_mem_vgprs; > - unsigned lds_size; > - unsigned max_simd_waves; > - unsigned spi_ps_input_ena; > - unsigned spi_ps_input_addr; > - unsigned float_mode; > - unsigned scratch_bytes_per_wave; > - unsigned rsrc1; > - unsigned rsrc2; > -}; > - > /* GCN-specific shader info. */ > struct si_shader_info { > ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; > ubyte num_input_sgprs; > ubyte num_input_vgprs; > signed char face_vgpr_index; > signed char ancillary_vgpr_index; > bool uses_instanceid; > ubyte nr_pos_exports; > ubyte nr_param_exports; > @@ -605,22 +589,24 @@ struct si_shader { > struct si_shader_key key; > struct util_queue_fence ready; > bool compilation_failed; > bool is_monolithic; > bool is_optimized; > bool is_binary_shared; > bool is_gs_copy_shader; > > /* The following data is all that's needed for binary shaders. */ > struct ac_shader_binary binary; > - struct si_shader_config config; > + struct ac_shader_config config; > struct si_shader_info info; > + unsigned private_mem_vgprs; > + unsigned max_simd_waves; >

The shader cache stores "config" but not these new members (private_mem_vgprs and max_simd_waves).

Marek
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
