Am 26.04.2016 um 23:18 schrieb Dave Airlie: > On 27 April 2016 at 06:07, Roland Scheidegger <srol...@vmware.com> wrote: >> Am 26.04.2016 um 06:42 schrieb Dave Airlie: >>> From: Dave Airlie <airl...@redhat.com> >>> >>> This enables ARB_compute_shader on softpipe. I've only >>> tested this with piglit so far, and I hopefully plan >>> on integrating it with my vulkan work. I'll get to >>> testing it with deqp more later. >>> >>> The basic premise is to create up to 1024 restartable >>> TGSI machines, and execute workgroups of those machines. >>> >>> Signed-off-by: Dave Airlie <airl...@redhat.com> >>> --- >>> src/gallium/drivers/softpipe/Makefile.sources | 1 + >>> src/gallium/drivers/softpipe/sp_compute.c | 211 >>> +++++++++++++++++++++++++ >>> src/gallium/drivers/softpipe/sp_context.c | 3 + >>> src/gallium/drivers/softpipe/sp_context.h | 4 +- >>> src/gallium/drivers/softpipe/sp_screen.c | 48 +++++- >>> src/gallium/drivers/softpipe/sp_state.h | 9 ++ >>> src/gallium/drivers/softpipe/sp_state_shader.c | 51 ++++++ >>> 7 files changed, 324 insertions(+), 3 deletions(-) >>> create mode 100644 src/gallium/drivers/softpipe/sp_compute.c >>> >>> diff --git a/src/gallium/drivers/softpipe/Makefile.sources >>> b/src/gallium/drivers/softpipe/Makefile.sources >>> index 1d42351..d72266f 100644 >>> --- a/src/gallium/drivers/softpipe/Makefile.sources >>> +++ b/src/gallium/drivers/softpipe/Makefile.sources >>> @@ -4,6 +4,7 @@ C_SOURCES := \ >>> sp_clear.h \ >>> sp_context.c \ >>> sp_context.h \ >>> + sp_compute.c \ >>> sp_draw_arrays.c \ >>> sp_fence.c \ >>> sp_fence.h \ >>> diff --git a/src/gallium/drivers/softpipe/sp_compute.c >>> b/src/gallium/drivers/softpipe/sp_compute.c >>> new file mode 100644 >>> index 0000000..7467686 >>> --- /dev/null >>> +++ b/src/gallium/drivers/softpipe/sp_compute.c >>> @@ -0,0 +1,211 @@ >>> +#include "util/u_inlines.h" >>> +#include "util/u_math.h" >>> +#include "util/u_memory.h" >>> +#include "util/u_pstipple.h" >>> +#include "pipe/p_shader_tokens.h" >>> 
+#include "draw/draw_context.h" >>> +#include "draw/draw_vertex.h" >>> +#include "sp_context.h" >>> +#include "sp_screen.h" >>> +#include "sp_state.h" >>> +#include "sp_texture.h" >>> +#include "sp_tex_sample.h" >>> +#include "sp_tex_tile_cache.h" >>> +#include "tgsi/tgsi_parse.h" >>> + >>> +static void >>> +cs_prepare(const struct sp_compute_shader *cs, >>> + struct tgsi_exec_machine *machine, >>> + int w, int h, int d, >>> + int g_w, int g_h, int g_d, >>> + int b_w, int b_h, int b_d, >>> + struct tgsi_sampler *sampler, >>> + struct tgsi_image *image, >>> + struct tgsi_buffer *buffer ) >>> +{ >>> + int j; >>> + /* >>> + * Bind tokens/shader to the interpreter's machine state. >>> + */ >>> + tgsi_exec_machine_bind_shader(machine, >>> + cs->tokens, >>> + sampler, image, buffer); >>> + >>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) { >>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID]; >>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >>> + machine->SystemValue[i].xyzw[0].i[j] = w; >>> + machine->SystemValue[i].xyzw[1].i[j] = h; >>> + machine->SystemValue[i].xyzw[2].i[j] = d; >>> + } >>> + } >>> + >>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE] != -1) { >>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE]; >>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >>> + machine->SystemValue[i].xyzw[0].i[j] = g_w; >>> + machine->SystemValue[i].xyzw[1].i[j] = g_h; >>> + machine->SystemValue[i].xyzw[2].i[j] = g_d; >>> + } >>> + } >>> + >>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE] != -1) { >>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE]; >>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >>> + machine->SystemValue[i].xyzw[0].i[j] = b_w; >>> + machine->SystemValue[i].xyzw[1].i[j] = b_h; >>> + machine->SystemValue[i].xyzw[2].i[j] = b_d; >>> + } >>> + } >>> +} >>> + >>> +static bool >>> +cs_run(const struct sp_compute_shader *cs, >>> + int g_w, int g_h, int g_d, >>> + struct 
tgsi_exec_machine *machine, bool restart) >>> +{ >>> + if (!restart) { >>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID] != -1) { >>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID]; >>> + int j; >>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) { >>> + machine->SystemValue[i].xyzw[0].i[j] = g_w; >>> + machine->SystemValue[i].xyzw[1].i[j] = g_h; >>> + machine->SystemValue[i].xyzw[2].i[j] = g_d; >>> + } >>> + } >>> + machine->NonHelperMask = (1 << 1) - 1; >>> + } >>> + >>> + tgsi_exec_machine_run(machine, restart ? machine->pc : 0); >>> + >>> + if (machine->pc != -1) >>> + return true; >>> + return false; >>> +} >>> + >>> +static void >>> +run_workgroup(const struct sp_compute_shader *cs, >>> + int g_w, int g_h, int g_d, int num_threads, >>> + struct tgsi_exec_machine **machines) >>> +{ >>> + int i; >>> + bool grp_hit_barrier, restart_threads = false; >>> + >>> + do { >>> + grp_hit_barrier = false; >>> + for (i = 0; i < num_threads; i++) { >>> + grp_hit_barrier |= cs_run(cs, g_w, g_h, g_d, machines[i], >>> restart_threads); >>> + } >>> + restart_threads = false; >>> + if (grp_hit_barrier) { >>> + grp_hit_barrier = false; >>> + restart_threads = true; >>> + } >>> + } while (restart_threads); >>> +} >>> + >>> +static void >>> +cs_delete(const struct sp_compute_shader *cs, >>> + struct tgsi_exec_machine *machine) >>> +{ >>> + if (machine->Tokens == cs->tokens) { >>> + tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL, NULL); >>> + } >>> +} >>> + >>> +static void >>> +fill_grid_size(struct pipe_context *context, >>> + const struct pipe_grid_info *info, >>> + uint32_t grid_size[3]) >>> +{ >>> + struct pipe_transfer *transfer; >>> + uint32_t *params; >>> + if (!info->indirect) { >>> + grid_size[0] = info->grid[0]; >>> + grid_size[1] = info->grid[1]; >>> + grid_size[2] = info->grid[2]; >>> + return; >>> + } >>> + params = pipe_buffer_map_range(context, info->indirect, >>> + info->indirect_offset, >>> + 3 * sizeof(uint32_t), >>> + 
PIPE_TRANSFER_READ, >>> + &transfer); >>> + >>> + if (!transfer) >>> + return; >>> + >>> + grid_size[0] = params[0]; >>> + grid_size[1] = params[1]; >>> + grid_size[2] = params[2]; >>> + pipe_buffer_unmap(context, transfer); >>> +} >>> + >>> +void >>> +softpipe_launch_grid(struct pipe_context *context, >>> + const struct pipe_grid_info *info) >>> +{ >>> + struct softpipe_context *softpipe = softpipe_context(context); >>> + struct sp_compute_shader *cs = softpipe->cs; >>> + int num_threads_in_group; >>> + struct tgsi_exec_machine **machines; >>> + int bwidth, bheight, bdepth; >>> + int w, h, d, i; >>> + int g_w, g_h, g_d; >>> + uint32_t grid_size[3]; >>> + void *local_mem = NULL; >>> + >>> + bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]; >>> + bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT]; >>> + bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; >>> + num_threads_in_group = bwidth * bheight * bdepth; >>> + >>> + fill_grid_size(context, info, grid_size); >>> + >>> + if (cs->shader.req_local_mem) { >>> + local_mem = CALLOC(1, cs->shader.req_local_mem); >>> + } >>> + >>> + machines = CALLOC(sizeof(struct tgsi_exec_machine *), >>> num_threads_in_group); >>> + if (!machines) >>> + return; >>> + >>> + /* initialise machines + GRID_SIZE + THREAD_ID + BLOCK_SIZE */ >>> + for (d = 0; d < bdepth; d++) { >>> + for (h = 0; h < bheight; h++) { >>> + for (w = 0; w < bwidth; w++) { >>> + int idx = w + (h * bwidth) + (d * bheight * bwidth); >>> + machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE); >>> + >>> + machines[idx]->LocalMem = local_mem; >>> + machines[idx]->LocalMemSize = cs->shader.req_local_mem; >>> + cs_prepare(cs, machines[idx], >>> + w, h, d, >>> + grid_size[0], grid_size[1], grid_size[2], >>> + bwidth, bheight, bdepth, >>> + (struct tgsi_sampler >>> *)softpipe->tgsi.sampler[PIPE_SHADER_COMPUTE], >>> + (struct tgsi_image >>> *)softpipe->tgsi.image[PIPE_SHADER_COMPUTE], >>> + (struct tgsi_buffer >>> 
*)softpipe->tgsi.buffer[PIPE_SHADER_COMPUTE]); >>> + tgsi_exec_set_constant_buffers(machines[idx], >>> PIPE_MAX_CONSTANT_BUFFERS, >>> + >>> softpipe->mapped_constants[PIPE_SHADER_COMPUTE], >>> + >>> softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]); >>> + } >>> + } >>> + } >>> + >>> + for (g_d = 0; g_d < grid_size[2]; g_d++) { >>> + for (g_h = 0; g_h < grid_size[1]; g_h++) { >>> + for (g_w = 0; g_w < grid_size[0]; g_w++) { >>> + run_workgroup(cs, g_w, g_h, g_d, num_threads_in_group, >>> machines); >>> + } >>> + } >>> + } >>> + >>> + for (i = 0; i < num_threads_in_group; i++) { >>> + cs_delete(cs, machines[i]); >>> + } >>> + >>> + FREE(local_mem); >>> + FREE(machines); >>> +} >>> diff --git a/src/gallium/drivers/softpipe/sp_context.c >>> b/src/gallium/drivers/softpipe/sp_context.c >>> index e3ec524..1690e38 100644 >>> --- a/src/gallium/drivers/softpipe/sp_context.c >>> +++ b/src/gallium/drivers/softpipe/sp_context.c >>> @@ -212,6 +212,7 @@ softpipe_create_context(struct pipe_screen *screen, >>> >>> softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE ); >>> softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE ); >>> + softpipe->dump_cs = debug_get_bool_option( "SOFTPIPE_DUMP_CS", FALSE ); >>> >>> softpipe->pipe.screen = screen; >>> softpipe->pipe.destroy = softpipe_destroy; >>> @@ -233,6 +234,8 @@ softpipe_create_context(struct pipe_screen *screen, >>> >>> softpipe->pipe.draw_vbo = softpipe_draw_vbo; >>> >>> + softpipe->pipe.launch_grid = softpipe_launch_grid; >>> + >>> softpipe->pipe.clear = softpipe_clear; >>> softpipe->pipe.flush = softpipe_flush_wrapped; >>> softpipe->pipe.texture_barrier = softpipe_texture_barrier; >>> diff --git a/src/gallium/drivers/softpipe/sp_context.h >>> b/src/gallium/drivers/softpipe/sp_context.h >>> index 70d00c8..a57f587 100644 >>> --- a/src/gallium/drivers/softpipe/sp_context.h >>> +++ b/src/gallium/drivers/softpipe/sp_context.h >>> @@ -71,6 +71,7 @@ struct softpipe_context { >>> struct 
sp_geometry_shader *gs; >>> struct sp_velems_state *velems; >>> struct sp_so_state *so; >>> + struct sp_compute_shader *cs; >>> >>> /** Other rendering state */ >>> struct pipe_blend_color blend_color; >>> @@ -205,10 +206,11 @@ struct softpipe_context { >>> * XXX wouldn't it make more sense for the tile cache to just be part >>> * of sp_sampler_view? >>> */ >>> - struct softpipe_tex_tile_cache >>> *tex_cache[PIPE_SHADER_GEOMETRY+1][PIPE_MAX_SHADER_SAMPLER_VIEWS]; >>> + struct softpipe_tex_tile_cache >>> *tex_cache[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; >>> >>> unsigned dump_fs : 1; >>> unsigned dump_gs : 1; >>> + unsigned dump_cs : 1; >>> unsigned no_rast : 1; >>> }; >>> >>> diff --git a/src/gallium/drivers/softpipe/sp_screen.c >>> b/src/gallium/drivers/softpipe/sp_screen.c >>> index d89d95c..4beeb80 100644 >>> --- a/src/gallium/drivers/softpipe/sp_screen.c >>> +++ b/src/gallium/drivers/softpipe/sp_screen.c >>> @@ -157,7 +157,7 @@ softpipe_get_param(struct pipe_screen *screen, enum >>> pipe_cap param) >>> case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: >>> return 0; >>> case PIPE_CAP_COMPUTE: >>> - return 0; >>> + return 1; >>> case PIPE_CAP_USER_VERTEX_BUFFERS: >>> case PIPE_CAP_USER_INDEX_BUFFERS: >>> case PIPE_CAP_USER_CONSTANT_BUFFERS: >>> @@ -289,6 +289,8 @@ softpipe_get_shader_param(struct pipe_screen *screen, >>> unsigned shader, enum pipe >>> { >>> case PIPE_SHADER_FRAGMENT: >>> return tgsi_exec_get_shader_param(param); >>> + case PIPE_SHADER_COMPUTE: >>> + return tgsi_exec_get_shader_param(param); >>> case PIPE_SHADER_VERTEX: >>> case PIPE_SHADER_GEOMETRY: >>> if (sp_screen->use_llvm) >>> @@ -447,6 +449,48 @@ softpipe_get_timestamp(struct pipe_screen *_screen) >>> return os_time_get_nano(); >>> } >>> >>> +static int >>> +softpipe_get_compute_param(struct pipe_screen *_screen, >>> + enum pipe_shader_ir ir_type, >>> + enum pipe_compute_cap param, >>> + void *ret) >>> +{ >>> + switch (param) { >>> + case PIPE_COMPUTE_CAP_IR_TARGET: >>> + 
return 0; >>> + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: >>> + if (ret) { >>> + uint64_t *grid_size = ret; >>> + grid_size[0] = 65535; >>> + grid_size[1] = 65535; >>> + grid_size[2] = 65535; >>> + } >>> + return 3 * sizeof(uint64_t) ; >>> + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: >>> + if (ret) { >>> + uint64_t *block_size = ret; >>> + block_size[0] = 1024; >>> + block_size[1] = 1024; >>> + block_size[2] = 1024; >>> + } >>> + return 3 * sizeof(uint64_t); >>> + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: >>> + if (ret) { >>> + uint64_t *max_threads_per_block = ret; >>> + *max_threads_per_block = 2048; >>> + } >>> + return sizeof(uint64_t); >>> + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: >>> + if (ret) { >>> + uint64_t *max_local_size = ret; >>> + /* Value reported by the closed source driver. */ >> >> The comment here doesn't make much sense... >> >> 1024 interpreted tgsi machines, all running serially - I'm sure >> performance is going to be amazing. >> >> But the approach looks reasonable to me. >> >> I'm not really familiar with compute shaders, but what I'm wondering is >> since tgsi exec always operates on 4 values at a time, is that somehow >> implicit in compute shaders? > > So far I've set the execmask to 1 active channel, I'm contemplating > changing that > though and using less machines. Ah yes, I think that would indeed be desirable.
> > Any ideas how to implement this in llvm? :-) 1024 CPU threads? I suppose 1024 is really the minimum work size you have to support? But since things are always run 4-wide (or 8-wide) that would "only" be 256 (or 128) threads. That many threads sounds a bit suboptimal to me (unless you really have a boatload of cpu cores), but why not - I suppose you can always pause some of the threads, not all need to be active at the same time. Though I wonder what the opencl-on-cpu guys do... Roland _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev