On Thu, Jan 8, 2015 at 3:37 PM, Marek Olšák <mar...@gmail.com> wrote: > On Thu, Jan 8, 2015 at 8:41 PM, Tom Stellard <t...@stellard.net> wrote: >> On Thu, Jan 08, 2015 at 07:51:58PM +0100, Marek Olšák wrote: >>> On Thu, Jan 8, 2015 at 7:17 PM, Tom Stellard <t...@stellard.net> wrote: >>> > On Thu, Jan 08, 2015 at 07:00:11PM +0100, Marek Olšák wrote: >>> >> On Wed, Jan 7, 2015 at 10:03 PM, Tom Stellard <thomas.stell...@amd.com> >>> >> wrote: >>> >> > --- >>> >> > src/gallium/drivers/radeonsi/si_compute.c | 40 ++------------ >>> >> > src/gallium/drivers/radeonsi/si_shader.c | 69 >>> >> > ++++++++++++++++++++++++- >>> >> > src/gallium/drivers/radeonsi/si_shader.h | 7 ++- >>> >> > src/gallium/drivers/radeonsi/si_state_shaders.c | 36 +++++++++++-- >>> >> > 4 files changed, 108 insertions(+), 44 deletions(-) >>> >> > >>> >> > diff --git a/src/gallium/drivers/radeonsi/si_compute.c >>> >> > b/src/gallium/drivers/radeonsi/si_compute.c >>> >> > index 37e5c42..e5f158d 100644 >>> >> > --- a/src/gallium/drivers/radeonsi/si_compute.c >>> >> > +++ b/src/gallium/drivers/radeonsi/si_compute.c >>> >> > @@ -42,12 +42,6 @@ >>> >> > #define NUM_USER_SGPRS 4 >>> >> > #endif >>> >> > >>> >> > -static const char *scratch_rsrc_dword0_symbol = >>> >> > - "SCRATCH_RSRC_DWORD0"; >>> >> > - >>> >> > -static const char *scratch_rsrc_dword1_symbol = >>> >> > - "SCRATCH_RSRC_DWORD1"; >>> >> > - >>> >> > struct si_compute { >>> >> > struct si_context *ctx; >>> >> > >>> >> > @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch( >>> >> > return scratch_waves; >>> >> > } >>> >> > >>> >> > -static void apply_scratch_relocs(const struct si_screen *sscreen, >>> >> > - const struct radeon_shader_binary *binary, >>> >> > - struct si_shader *shader, uint64_t scratch_va) >>> >> > { >>> >> > - unsigned i; >>> >> > - char *ptr; >>> >> > - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; >>> >> > - uint32_t scratch_rsrc_dword1 = >>> >> > - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) >>> >> > - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / >>> >> > 64); >>> >> > - >>> >> > - if (!binary->reloc_count) { >>> >> > - return; >>> >> > - } >>> >> > - >>> >> > - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, >>> >> > - PIPE_TRANSFER_READ_WRITE); >>> >> > - for (i = 0 ; i < binary->reloc_count; i++) { >>> >> > - const struct radeon_shader_reloc *reloc = >>> >> > &binary->relocs[i]; >>> >> > - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { >>> >> > - util_memcpy_cpu_to_le32(ptr + reloc->offset, >>> >> > - &scratch_rsrc_dword0, 4); >>> >> > - } else if (!strcmp(scratch_rsrc_dword1_symbol, >>> >> > reloc->name)) { >>> >> > - util_memcpy_cpu_to_le32(ptr + reloc->offset, >>> >> > - &scratch_rsrc_dword1, 4); >>> >> > - } >>> >> > - } >>> >> > - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); >>> >> > -} >>> >> > - >>> >> > static void si_launch_grid( >>> >> > struct pipe_context *ctx, >>> >> > const uint *block_layout, const uint *grid_layout, >>> >> > @@ -256,7 +221,8 @@ static void si_launch_grid( >>> >> > >>> >> > #if HAVE_LLVM >= 0x0306 >>> >> > /* Read the config information */ >>> >> > - si_shader_binary_read_config(&program->binary, >>> >> > &program->program, pc); >>> >> > + si_shader_binary_read_config(sctx->screen, &program->binary, >>> >> > + &program->program, pc); >>> >> > #endif >>> >> > >>> >> > /* Upload the kernel arguments */ >>> >> > @@ -295,7 +261,7 @@ static void si_launch_grid( >>> >> > RADEON_PRIO_SHADER_RESOURCE_RW); >>> >> > >>> >> > /* Patch the shader with the scratch buffer address. */ >>> >> > - apply_scratch_relocs(sctx->screen, >>> >> > + si_shader_apply_scratch_relocs(sctx->screen, >>> >> > &program->binary, shader, scratch_buffer_va); >>> >> > >>> >> > } >>> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.c >>> >> > b/src/gallium/drivers/radeonsi/si_shader.c >>> >> > index cf28860..d59e736 100644 >>> >> > --- a/src/gallium/drivers/radeonsi/si_shader.c >>> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.c >>> >> > @@ -46,6 +46,12 @@ >>> >> > >>> >> > #include <errno.h> >>> >> > >>> >> > +static const char *scratch_rsrc_dword0_symbol = >>> >> > + "SCRATCH_RSRC_DWORD0"; >>> >> > + >>> >> > +static const char *scratch_rsrc_dword1_symbol = >>> >> > + "SCRATCH_RSRC_DWORD1"; >>> >> > + >>> >> > struct si_shader_output_values >>> >> > { >>> >> > LLVMValueRef values[4]; >>> >> > @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct >>> >> > si_shader_context *si_shader_ctx) >>> >> > } >>> >> > } >>> >> > >>> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary >>> >> > *binary, >>> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen, >>> >> > + const struct radeon_shader_binary >>> >> > *binary, >>> >> > struct si_shader *shader, >>> >> > unsigned symbol_offset) >>> >> > { >>> >> > @@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct >>> >> > radeon_shader_binary *binary, >>> >> > case R_0286CC_SPI_PS_INPUT_ENA: >>> >> > shader->spi_ps_input_ena = value; >>> >> > break; >>> >> > + case R_0286E8_SPI_TMPRING_SIZE: >>> >> > + /* XXX: This is the maximum value allowed. >>> >> > I'm not sure >>> >> > + * how compute this for non-cs shaders. >>> >> > + */ >>> >> > + shader->scratch_waves = >>> >> > + 32 * sscreen->b.info.max_compute_units; >>> >> > + /* Fall-through */ >>> >> > + >>> >> > case R_00B860_COMPUTE_TMPRING_SIZE: >>> >> > /* WAVESIZE is in units of 256 dwords. */ >>> >> > shader->scratch_bytes_per_wave = >>> >> > @@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct >>> >> > radeon_shader_binary *binary, >>> >> > } >>> >> > } >>> >> > >>> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, >>> >> > + const struct radeon_shader_binary *binary, >>> >> > + struct si_shader *shader, uint64_t scratch_va) >>> >> > { >>> >> > + unsigned i; >>> >> > + char *ptr; >>> >> > + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; >>> >> > + uint32_t scratch_rsrc_dword1 = >>> >> > + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) >>> >> > + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / >>> >> > 64); >>> >> > + >>> >> > + if (!binary->reloc_count) { >>> >> > + return; >>> >> > + } >>> >> > + >>> >> > + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, >>> >> > + PIPE_TRANSFER_READ_WRITE); >>> >> > + for (i = 0 ; i < binary->reloc_count; i++) { >>> >> > + const struct radeon_shader_reloc *reloc = >>> >> > &binary->relocs[i]; >>> >> > + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { >>> >> > + util_memcpy_cpu_to_le32(ptr + reloc->offset, >>> >> > + &scratch_rsrc_dword0, 4); >>> >> > + } else if (!strcmp(scratch_rsrc_dword1_symbol, >>> >> > reloc->name)) { >>> >> > + util_memcpy_cpu_to_le32(ptr + reloc->offset, >>> >> > + &scratch_rsrc_dword1, 4); >>> >> > + } >>> >> > + } >>> >> > + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); >>> >> > +} >>> >> > + >>> >> > + >>> >> > int si_shader_binary_read(struct si_screen *sscreen, >>> >> > struct si_shader *shader, >>> >> > const struct radeon_shader_binary *binary) >>> >> > @@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen >>> >> > *sscreen, >>> >> > } >>> >> > } >>> >> > >>> >> > - si_shader_binary_read_config(binary, shader, 0); >>> >> > + si_shader_binary_read_config(sscreen, binary, shader, 0); >>> >> > >>> >> > /* copy new shader */ >>> >> > code_size = binary->code_size + binary->rodata_size; >>> >> > @@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen >>> >> > *sscreen, >>> >> > util_memcpy_cpu_to_le32(ptr, binary->rodata, >>> >> > binary->rodata_size); >>> >> > } >>> >> > >>> >> > + >>> >> >>> >> Unintentional new line? >>> >> >>> >> > sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); >>> >> > >>> >> > return 0; >>> >> > @@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, >>> >> > struct si_shader *shader, >>> >> > return r; >>> >> > } >>> >> > r = si_shader_binary_read(sscreen, shader, &binary); >>> >> > + >>> >> > + if (shader->scratch_bytes_per_wave > 0) { >>> >> > + uint64_t scratch_buffer_va; >>> >> > + unsigned scratch_bytes = >>> >> > shader->scratch_bytes_per_wave * >>> >> > + shader->scratch_waves; >>> >> > + /* It's possible for different shader variants to have >>> >> > + * different scratch buffer sizes. The scratch buffer >>> >> > sizes >>> >> > + * won't vary by too much, so allocating double the >>> >> > scratch >>> >> > + * space need for the first variant should be enough >>> >> > for the >>> >> > + * rest if they need it. >>> >> > + */ >>> >> > + shader->scratch_bo = (struct r600_resource*) >>> >> > + >>> >> > si_resource_create_custom(&sscreen->b.b, >>> >> > + PIPE_USAGE_DEFAULT, scratch_bytes * 2); >>> >> > + scratch_buffer_va = shader->scratch_bo->gpu_address; >>> >> > + si_shader_apply_scratch_relocs(sscreen, &binary, >>> >> > shader, >>> >> > + scratch_buffer_va); >>> >> > + } >>> >> > + >>> >> > FREE(binary.code); >>> >> > FREE(binary.config); >>> >> > FREE(binary.rodata); >>> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.h >>> >> > b/src/gallium/drivers/radeonsi/si_shader.h >>> >> > index 08e344a..c5b274e 100644 >>> >> > --- a/src/gallium/drivers/radeonsi/si_shader.h >>> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.h >>> >> > @@ -147,6 +147,7 @@ struct si_shader { >>> >> > unsigned lds_size; >>> >> > unsigned spi_ps_input_ena; >>> >> > unsigned scratch_bytes_per_wave; >>> >> > + unsigned scratch_waves; >>> >> > unsigned spi_shader_col_format; >>> >> > unsigned spi_shader_z_format; >>> >> > unsigned db_shader_control; >>> >> > @@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, >>> >> > struct si_shader *shader); >>> >> > unsigned si_shader_io_get_unique_index(unsigned semantic_name, >>> >> > unsigned index); >>> >> > int si_shader_binary_read(struct si_screen *sscreen, struct si_shader >>> >> > *shader, >>> >> > const struct radeon_shader_binary *binary); >>> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary >>> >> > *binary, >>> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, >>> >> > + const struct radeon_shader_binary *binary, >>> >> > + struct si_shader *shader, uint64_t scratch_va); >>> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen, >>> >> > + const struct radeon_shader_binary >>> >> > *binary, >>> >> > struct si_shader *shader, >>> >> > unsigned symbol_offset); >>> >> > >>> >> > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c >>> >> > b/src/gallium/drivers/radeonsi/si_state_shaders.c >>> >> > index 817a990..b6a0f32 100644 >>> >> > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c >>> >> > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c >>> >> > @@ -33,6 +33,22 @@ >>> >> > #include "util/u_memory.h" >>> >> > #include "util/u_simple_shaders.h" >>> >> > >>> >> > +static void si_shader_setup_scratch_buffer(struct si_shader *shader, >>> >> > + struct si_pm4_state *pm4) { >>> >> > + if (shader->scratch_bytes_per_wave == 0) { >>> >> > + return; >>> >> > + } >>> >> > + >>> >> > + assert(shader->scratch_bo); >>> >> > + >>> >> > + si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE, >>> >> > + S_0286E8_WAVES(shader->scratch_waves) >>> >> > + | >>> >> > S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10)); >>> >> >>> >> What happens if all VS, GS, and PS spill VGPRs? Will they share the >>> >> scratch buffer? Will the scratch buffer be large enough for all of >>> >> them? Will the WAVES and WAVESIZE parameters be the same for all of them? >>> >> >>> > >>> > Each shader will have its own scratch buffer. WAVES will always be the >>> > same, but WAVESIZE depends on the number of VGPRs spilled. >>> >>> But SPI_TMPRING_SIZE is shared by all stages, right? Will it not cause >>> problems if the register value (especially WAVESIZE) is different for >>> ES, GS, VS, and PS? Perhaps WAVESIZE should be set to a maximum of all >>> enabled shaders, right? >>> >>> Example sequence of state emission: >>> - emit VS regs >>> - emit SPI_TMPRING_SIZE for VS >>> - emit PS regs >>> - emit SPI_TMPRING_SIZE for PS >>> >>> Assuming VS.SPI_TMPRING_SIZE != PS.SPI_TMPRING_SIZE. >>> >> >> Ok, I see why this is a problem. >> >> If we use the same WAVESIZE for all shader types, then this means that >> we either need to use the same scratch buffer for all shader types or >> have all the scratch buffers be the same size. >> >> In a pipeline with VS and PS, do we know that all VS threads will >> complete before any of the PS threads start? If not, I think we will >> need one scratch buffer per shader type. > > No idea. Do you happen to know this, Christian? Does the 8-context CP > stuff have anything to do with this? >
The CP context stuff is for pipelining state changes (e.g., so you can have multiple draws in the pipeline at once). On r600 asics there was a tmp ring available for each shader stage, on SI, there's just one tmp ring for the whole gfx pipeline (SPI_TMPRING) and one tmp ring for the compute pipeline (COMPUTE_TMPRING). SPI_TMPRING is a context register (0x28000+), so it's multi-state pipelined just like the other context registers. We can only set one buffer for all shader types enabled in the current state. Alex >> >> No matter which of these solutions we choose, we will have to allocate >> the scratch buffer when the shaders are bound to the state, because that >> is the only time we will know how big they will need to be. >> >> It seems like the best solution may be to have a global per context >> scratch buffer which has its size increased every time a shader that >> requires more scratch space is bound to a state. >> >> What do you think? > > This sounds very good to me. > > It looks like we may need to add a separate state for the register, > because the shader pm4 states can't be changed after they are built. > There are 3 options: > 1) Add si_pm4_state to struct si_states. See e.g. how the "spi" state > is done. This is not very efficient for non-CSO states though. > 2) Add an r600_atom. See e.g. how the "clip_regs" state is done. > 3) Add the register emission to draw_vbo. Don't worry, it will still > be emitted conditionally. See e.g. how si_emit_rasterizer_prim_state > is done. > > For 1-register states, (3) is probably the best. > > Marek > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev