On Mon, Apr 4, 2016 at 6:51 PM, Marek Olšák <[email protected]> wrote: > On Sat, Apr 2, 2016 at 3:10 PM, Bas Nieuwenhuizen > <[email protected]> wrote: >> Instead of having a scratch buffer per program, have one per >> context. >> >> Also removed the per kernel wave count calculations, but >> that only helped if the total number of waves in the dispatch >> was smaller than sctx->scratch_waves. >> >> Signed-off-by: Bas Nieuwenhuizen <[email protected]> >> --- >> src/gallium/drivers/radeonsi/si_compute.c | 136 >> ++++++++++-------------------- >> src/gallium/drivers/radeonsi/si_pipe.c | 1 + >> src/gallium/drivers/radeonsi/si_pipe.h | 2 + >> 3 files changed, 46 insertions(+), 93 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >> b/src/gallium/drivers/radeonsi/si_compute.c >> index 7320ace..e712b46 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute.c >> +++ b/src/gallium/drivers/radeonsi/si_compute.c >> @@ -46,47 +46,6 @@ struct si_compute { >> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS]; >> }; >> >> -static void init_scratch_buffer(struct si_context *sctx, struct si_compute >> *program) >> -{ >> - unsigned scratch_bytes = 0; >> - uint64_t scratch_buffer_va; >> - unsigned i; >> - >> - /* Compute the scratch buffer size using the maximum number of waves. >> - * This way we don't need to recompute it for each kernel launch. 
*/ >> - unsigned scratch_waves = 32 * >> sctx->screen->b.info.num_good_compute_units; >> - for (i = 0; i < program->shader.binary.global_symbol_count; i++) { >> - unsigned offset = >> - >> program->shader.binary.global_symbol_offsets[i]; >> - unsigned scratch_bytes_needed; >> - >> - si_shader_binary_read_config(&program->shader.binary, >> - &program->shader.config, >> offset); >> - scratch_bytes_needed = >> program->shader.config.scratch_bytes_per_wave; >> - scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed); >> - } >> - >> - if (scratch_bytes == 0) >> - return; >> - >> - program->shader.scratch_bo = >> - si_resource_create_custom(sctx->b.b.screen, >> - PIPE_USAGE_DEFAULT, >> - scratch_bytes * scratch_waves); >> - >> - scratch_buffer_va = program->shader.scratch_bo->gpu_address; >> - >> - /* apply_scratch_relocs needs scratch_bytes_per_wave to be set >> - * to the maximum bytes needed, so it can compute the stride >> - * correctly. >> - */ >> - program->shader.config.scratch_bytes_per_wave = scratch_bytes; >> - >> - /* Patch the shader with the scratch buffer address. */ >> - si_shader_apply_scratch_relocs(sctx, >> - &program->shader, scratch_buffer_va); >> -} >> - >> static void *si_create_compute_state( >> struct pipe_context *ctx, >> const struct pipe_compute_state *cso) >> @@ -140,11 +99,6 @@ static void *si_create_compute_state( >> code = cso->prog + sizeof(struct pipe_llvm_program_header); >> >> radeon_elf_read(code, header->num_bytes, >> &program->shader.binary); >> - /* init_scratch_buffer patches the shader code with the >> scratch address, >> - * so we need to call it before si_shader_binary_read() which >> uploads >> - * the shader code to the GPU. 
>> - */ >> - init_scratch_buffer(sctx, program); >> si_shader_binary_read_config(&program->shader.binary, >> &program->shader.config, 0); >> } >> @@ -189,43 +143,6 @@ static void si_set_global_binding( >> } >> } >> >> -/** >> - * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES >> - * /p block_layout is the number of threads in each work group. >> - * /p grid layout is the number of work groups. >> - */ >> -static unsigned compute_num_waves_for_scratch( >> - const struct radeon_info *info, >> - const uint *block_layout, >> - const uint *grid_layout) >> -{ >> - unsigned num_sh = MAX2(info->max_sh_per_se, 1); >> - unsigned num_se = MAX2(info->max_se, 1); >> - unsigned num_blocks = 1; >> - unsigned threads_per_block = 1; >> - unsigned waves_per_block; >> - unsigned waves_per_sh; >> - unsigned waves; >> - unsigned scratch_waves; >> - unsigned i; >> - >> - for (i = 0; i < 3; i++) { >> - threads_per_block *= block_layout[i]; >> - num_blocks *= grid_layout[i]; >> - } >> - >> - waves_per_block = align(threads_per_block, 64) / 64; >> - waves = waves_per_block * num_blocks; >> - waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se); >> - scratch_waves = waves_per_sh * num_sh * num_se; >> - >> - if (waves_per_block > waves_per_sh) { >> - scratch_waves = waves_per_block * num_sh * num_se; >> - } >> - >> - return scratch_waves; >> -} >> - >> static void si_initialize_compute(struct si_context *sctx) >> { >> struct radeon_winsys_cs *cs = sctx->b.gfx.cs; >> @@ -259,6 +176,43 @@ static void si_initialize_compute(struct si_context >> *sctx) >> sctx->cs_shader_state.initialized = true; >> } >> >> +static bool si_setup_compute_scratch_buffer(struct si_context *sctx, >> + struct si_shader *shader, >> + struct si_shader_config >> *config) { > > "{" on the next line please.
With this fixed, patches 10, 12, 15-16, 18 are:

Reviewed-by: Marek Olšák <[email protected]>

Marek
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
