I'm not sure that using a scratch buffer per command buffer is correct. AFAIU each ring has a separate counter for the scratch offsets, so if the same command buffer is executing on multiple compute rings at the same time, those per-ring counters could conflict (e.g. with the kind of simultaneous-use submission sketched below).
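To make that scenario concrete, here is a rough sketch of the kind of application usage I have in mind (hypothetical fragment only; cmd_buf, pipeline and the two compute queue handles are assumed to already exist, and the dispatch size is arbitrary):

  /* One command buffer, recorded with SIMULTANEOUS_USE_BIT, then submitted
   * to two compute queues at once.  If those queues map to different
   * hardware rings, each ring does its own scratch accounting, which is
   * where a single per-command-buffer scratch setup could conflict. */
  VkCommandBufferBeginInfo begin_info = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
          .flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT,
  };
  vkBeginCommandBuffer(cmd_buf, &begin_info);
  vkCmdBindPipeline(cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
  vkCmdDispatch(cmd_buf, 64, 1, 1);
  vkEndCommandBuffer(cmd_buf);

  VkSubmitInfo submit = {
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
          .commandBufferCount = 1,
          .pCommandBuffers = &cmd_buf,
  };
  /* No fence or semaphore between these submits, so both rings can be
   * executing the same IB at the same time. */
  vkQueueSubmit(compute_queue0, 1, &submit, VK_NULL_HANDLE);
  vkQueueSubmit(compute_queue1, 1, &submit, VK_NULL_HANDLE);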
I'd think we need a preamble IB per queue that sets SGPR0/1 for all relevant stages, and modify the winsys so that that is called in the same submit ioctl as the application command buffers. - Bas On Tue, Jan 24, 2017, at 18:32, Dave Airlie wrote: > From: Dave Airlie <airl...@redhat.com> > > Currently LLVM 5.0 has support for spilling to a place > pointed to by the user sgprs instead of using relocations. > > This is enabled by using the amdgcn-mesa-mesa3d triple. > > For compute gfx shaders we spill to a buffer pointed to > by 64-bit address stored in sgprs 0/1. > For other gfx shaders we spill to a buffer pointed to by > the first two dwords of the buffer pointed to in sgprs 0/1. > > This patch enables radv to use the llvm support when present. > > This fixes Sascha Willems computeshader demo first screen, > and a bunch of CTS tests now pass. > > This patch is likely to be in LLVM 4.0 release as well > (fingers crossed) in which case we need to adjust the detection > logic. > > SIgned-off-by: Dave Airlie <airl...@redhat.com> > --- > src/amd/common/ac_binary.c | 30 +++++---- > src/amd/common/ac_binary.h | 4 +- > src/amd/common/ac_llvm_util.c | 4 +- > src/amd/common/ac_llvm_util.h | 2 +- > src/amd/common/ac_nir_to_llvm.c | 14 ++-- > src/amd/common/ac_nir_to_llvm.h | 6 +- > src/amd/vulkan/radv_cmd_buffer.c | 137 > ++++++++++++++++++++++++++++++++++++++- > src/amd/vulkan/radv_device.c | 22 +++++++ > src/amd/vulkan/radv_pipeline.c | 10 +-- > src/amd/vulkan/radv_private.h | 13 ++++ > 10 files changed, 215 insertions(+), 27 deletions(-) > > diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c > index 01cf000..9c66a82 100644 > --- a/src/amd/common/ac_binary.c > +++ b/src/amd/common/ac_binary.c > @@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol = > > void ac_shader_binary_read_config(struct ac_shader_binary *binary, > struct ac_shader_config *conf, > - unsigned symbol_offset) > + unsigned symbol_offset, > + bool supports_spill) > { > unsigned i; > const unsigned char *config = > ac_shader_binary_config_start(binary, symbol_offset); > bool really_needs_scratch = false; > - > + uint32_t wavesize = 0; > /* LLVM adds SGPR spills to the scratch size. > * Find out if we really need the scratch buffer. > */ > - for (i = 0; i < binary->reloc_count; i++) { > - const struct ac_shader_reloc *reloc = &binary->relocs[i]; > + if (supports_spill) { > + really_needs_scratch = true; > + } else { > + for (i = 0; i < binary->reloc_count; i++) { > + const struct ac_shader_reloc *reloc = > &binary->relocs[i]; > > - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || > - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { > - really_needs_scratch = true; > - break; > + if (!strcmp(scratch_rsrc_dword0_symbol, > reloc->name) || > + !strcmp(scratch_rsrc_dword1_symbol, > reloc->name)) { > + really_needs_scratch = true; > + break; > + } > } > } > > @@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct > ac_shader_binary *binary, > case R_0286E8_SPI_TMPRING_SIZE: > case R_00B860_COMPUTE_TMPRING_SIZE: > /* WAVESIZE is in units of 256 dwords. 
*/ > - if (really_needs_scratch) > - conf->scratch_bytes_per_wave = > - G_00B860_WAVESIZE(value) * 256 * > 4; > + wavesize = value; > break; > case SPILLED_SGPRS: > conf->spilled_sgprs = value; > @@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct > ac_shader_binary *binary, > if (!conf->spi_ps_input_addr) > conf->spi_ps_input_addr = conf->spi_ps_input_ena; > } > + > + if (really_needs_scratch) { > + /* sgprs spills aren't spilling */ > + conf->scratch_bytes_per_wave = > G_00B860_WAVESIZE(wavesize) * 256 * 4; > + } > } > diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h > index 282f33d..06fd855 100644 > --- a/src/amd/common/ac_binary.h > +++ b/src/amd/common/ac_binary.h > @@ -27,6 +27,7 @@ > #pragma once > > #include <stdint.h> > +#include <stdbool.h> > > struct ac_shader_reloc { > char name[32]; > @@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned > elf_size, > > void ac_shader_binary_read_config(struct ac_shader_binary *binary, > struct ac_shader_config *conf, > - unsigned symbol_offset); > + unsigned symbol_offset, > + bool supports_spill); > diff --git a/src/amd/common/ac_llvm_util.c > b/src/amd/common/ac_llvm_util.c > index 770e3bd..3ba5281 100644 > --- a/src/amd/common/ac_llvm_util.c > +++ b/src/amd/common/ac_llvm_util.c > @@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum > radeon_family family) > } > } > > -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family) > +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, > bool supports_spill) > { > assert(family >= CHIP_TAHITI); > > - const char *triple = "amdgcn--"; > + const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : > "amdgcn--"; > LLVMTargetRef target = ac_get_llvm_target(triple); > LLVMTargetMachineRef tm = LLVMCreateTargetMachine( > target, > diff --git a/src/amd/common/ac_llvm_util.h > b/src/amd/common/ac_llvm_util.h > index 802c266..2a5f325 100644 > --- a/src/amd/common/ac_llvm_util.h > +++ b/src/amd/common/ac_llvm_util.h > @@ -56,7 +56,7 @@ struct ac_llvm_context { > LLVMValueRef fpmath_md_2p5_ulp; > }; > > -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family > family); > +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, > bool supports_spill); > > void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes); > bool ac_is_sgpr_param(LLVMValueRef param); > diff --git a/src/amd/common/ac_nir_to_llvm.c > b/src/amd/common/ac_nir_to_llvm.c > index 26b87e8..43e079e 100644 > --- a/src/amd/common/ac_nir_to_llvm.c > +++ b/src/amd/common/ac_nir_to_llvm.c > @@ -458,10 +458,10 @@ static void create_function(struct > nir_to_llvm_context *ctx) > arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math); > set_llvm_calling_convention(ctx->main_function, ctx->stage); > > - > ctx->shader_info->num_input_sgprs = 0; > ctx->shader_info->num_input_vgprs = 0; > > + ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 
> 2 : 0; > for (i = 0; i < user_sgpr_count; i++) > ctx->shader_info->num_user_sgprs += > llvm_get_type_size(arg_types[i]) / 4; > > @@ -475,6 +475,10 @@ static void create_function(struct > nir_to_llvm_context *ctx) > > arg_idx = 0; > user_sgpr_idx = 0; > + > + set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx, > 2); > + user_sgpr_idx += 2; > + > for (unsigned i = 0; i < num_sets; ++i) { > if (ctx->options->layout->set[i].layout->shader_stages & (1 << > ctx->stage)) { > > set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], > user_sgpr_idx, 2); > @@ -4429,7 +4433,7 @@ LLVMModuleRef > ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, > > memset(shader_info, 0, sizeof(*shader_info)); > > - LLVMSetTarget(ctx.module, "amdgcn--"); > + LLVMSetTarget(ctx.module, options->supports_spill ? > "amdgcn-mesa-mesa3d" : "amdgcn--"); > setup_types(&ctx); > > ctx.builder = LLVMCreateBuilderInContext(ctx.context); > @@ -4563,7 +4567,7 @@ static void > ac_compile_llvm_module(LLVMTargetMachineRef tm, > struct ac_shader_config *config, > struct ac_shader_variant_info *shader_info, > gl_shader_stage stage, > - bool dump_shader) > + bool dump_shader, bool supports_spill) > { > if (dump_shader) > LLVMDumpModule(llvm_module); > @@ -4577,7 +4581,7 @@ static void > ac_compile_llvm_module(LLVMTargetMachineRef tm, > if (dump_shader) > fprintf(stderr, "disasm:\n%s\n", binary->disasm_string); > > - ac_shader_binary_read_config(binary, config, 0); > + ac_shader_binary_read_config(binary, config, 0, supports_spill); > > LLVMContextRef ctx = LLVMGetModuleContext(llvm_module); > LLVMDisposeModule(llvm_module); > @@ -4637,7 +4641,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm, > LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, > shader_info, > options); > > - ac_compile_llvm_module(tm, llvm_module, binary, config, > shader_info, nir->stage, dump_shader); > + ac_compile_llvm_module(tm, llvm_module, binary, config, > shader_info, nir->stage, dump_shader, options->supports_spill); > switch (nir->stage) { > case MESA_SHADER_COMPUTE: > for (int i = 0; i < 3; ++i) > diff --git a/src/amd/common/ac_nir_to_llvm.h > b/src/amd/common/ac_nir_to_llvm.h > index a57558e..9d66f94 100644 > --- a/src/amd/common/ac_nir_to_llvm.h > +++ b/src/amd/common/ac_nir_to_llvm.h > @@ -52,6 +52,7 @@ struct ac_nir_compiler_options { > struct radv_pipeline_layout *layout; > union ac_shader_variant_key key; > bool unsafe_math; > + bool supports_spill; > enum radeon_family family; > enum chip_class chip_class; > }; > @@ -64,8 +65,9 @@ struct ac_userdata_info { > }; > > enum ac_ud_index { > - AC_UD_PUSH_CONSTANTS = 0, > - AC_UD_SHADER_START = 1, > + AC_UD_SCRATCH = 0, > + AC_UD_PUSH_CONSTANTS = 1, > + AC_UD_SHADER_START = 2, > AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, > AC_UD_VS_BASE_VERTEX_START_INSTANCE, > AC_UD_VS_MAX_UD, > diff --git a/src/amd/vulkan/radv_cmd_buffer.c > b/src/amd/vulkan/radv_cmd_buffer.c > index c62d275..e904897 100644 > --- a/src/amd/vulkan/radv_cmd_buffer.c > +++ b/src/amd/vulkan/radv_cmd_buffer.c > @@ -466,6 +466,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer > *cmd_buffer, > va = ws->buffer_get_va(vs->bo); > ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8); > > + if (vs->config.scratch_bytes_per_wave) { > + uint32_t needed = vs->config.scratch_bytes_per_wave * > cmd_buffer->device->scratch_waves; > + if (needed > cmd_buffer->scratch_size_needed) > + cmd_buffer->scratch_size_needed = needed; > + cmd_buffer->scratch_needed_mask |= (1 << > MESA_SHADER_VERTEX); > + } > + > 
clip_dist_mask = vs->info.vs.clip_dist_mask; > cull_dist_mask = vs->info.vs.cull_dist_mask; > total_mask = clip_dist_mask | cull_dist_mask; > @@ -536,6 +543,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer > *cmd_buffer, > va = ws->buffer_get_va(ps->bo); > ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8); > > + if (ps->config.scratch_bytes_per_wave) { > + uint32_t needed = ps->config.scratch_bytes_per_wave * > cmd_buffer->device->scratch_waves; > + if (needed > cmd_buffer->scratch_size_needed) > + cmd_buffer->scratch_size_needed = needed; > + cmd_buffer->scratch_needed_mask |= (1 << > MESA_SHADER_FRAGMENT); > + } > + > radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4); > radeon_emit(cmd_buffer->cs, va >> 8); > radeon_emit(cmd_buffer->cs, va >> 40); > @@ -627,6 +641,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer > *cmd_buffer, > radeon_set_context_reg(cmd_buffer->cs, > R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, > pipeline->graphics.prim_restart_enable); > > + uint32_t max_scratch_bytes_per_wave = 0; > + max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave, > + > pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave); > + max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave, > + > pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave); > + > + radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE, > + > S_0286E8_WAVES(cmd_buffer->device->scratch_waves) | > + > S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10)); > cmd_buffer->state.emitted_pipeline = pipeline; > } > > @@ -1372,6 +1395,13 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer > *cmd_buffer) > > if (cmd_buffer->upload.upload_bo) > > cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo); > + > + if (cmd_buffer->scratch_bo) > + > cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo); > + > + if (cmd_buffer->compute_scratch_bo) > + > cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo); > + > cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs); > vk_free(&cmd_buffer->pool->alloc, cmd_buffer); > } > @@ -1402,6 +1432,19 @@ static void radv_reset_cmd_buffer(struct > radv_cmd_buffer *cmd_buffer) > free(up); > } > > + if (cmd_buffer->scratch_bo) { > + > cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo); > + cmd_buffer->scratch_bo = NULL; > + } > + > + if (cmd_buffer->compute_scratch_bo) { > + > cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo); > + cmd_buffer->compute_scratch_bo = NULL; > + } > + > + cmd_buffer->scratch_needed_mask = 0; > + cmd_buffer->scratch_size_needed = 0; > + cmd_buffer->compute_scratch_size_needed = 0; > if (cmd_buffer->upload.upload_bo) > cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, > > cmd_buffer->upload.upload_bo, 8); > @@ -1457,6 +1500,19 @@ VkResult radv_BeginCommandBuffer( > default: > break; > } > + > + uint32_t pad_word = 0xffff1000U; > + if > (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2) > + pad_word = 0x80000000; > + > + cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw; > + cmd_buffer->cs_to_patch_scratch = cmd_buffer->cs->buf; > + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { > + radeon_emit(cmd_buffer->cs, pad_word); > + radeon_emit(cmd_buffer->cs, pad_word); > + radeon_emit(cmd_buffer->cs, pad_word); > + radeon_emit(cmd_buffer->cs, pad_word); > + } > } > > if (pBeginInfo->flags & > VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) { > @@ -1594,6 +1650,70 @@ VkResult radv_EndCommandBuffer( > > if 
(cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) > si_emit_cache_flush(cmd_buffer); > + > + int idx = cmd_buffer->scratch_patch_idx; > + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && > cmd_buffer->compute_scratch_size_needed) { > + cmd_buffer->compute_scratch_bo = > cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws, > + > cmd_buffer->compute_scratch_size_needed, > + > 4096, > + > RADEON_DOMAIN_VRAM, > + > RADEON_FLAG_NO_CPU_ACCESS); > + > + if (!cmd_buffer->compute_scratch_bo) { > + cmd_buffer->record_fail = true; > + return VK_ERROR_OUT_OF_DEVICE_MEMORY; > + } > + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, > cmd_buffer->compute_scratch_bo, 8); > + > + uint64_t scratch_va = > cmd_buffer->device->ws->buffer_get_va(cmd_buffer->compute_scratch_bo); > + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> > 32) | > + S_008F04_SWIZZLE_ENABLE(1); > + uint32_t reg_base; > + > + reg_base = > shader_stage_to_user_data_0(MESA_SHADER_COMPUTE); > + cmd_buffer->cs_to_patch_scratch[idx++] = > PKT3(PKT3_SET_SH_REG, 2, 0); > + cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - > SI_SH_REG_OFFSET) >> 2; > + cmd_buffer->cs_to_patch_scratch[idx++] = scratch_va; > + cmd_buffer->cs_to_patch_scratch[idx++] = rsrc1; > + } > + > + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && > cmd_buffer->scratch_size_needed) { > + cmd_buffer->scratch_bo = > cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws, > + > cmd_buffer->scratch_size_needed, > + > 4096, > + > RADEON_DOMAIN_VRAM, > + > RADEON_FLAG_NO_CPU_ACCESS); > + > + if (!cmd_buffer->scratch_bo) { > + cmd_buffer->record_fail = true; > + return VK_ERROR_OUT_OF_DEVICE_MEMORY; > + } > + > + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, > cmd_buffer->scratch_bo, 8); > + > + uint64_t scratch_va = > cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo); > + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> > 32) | > + S_008F04_SWIZZLE_ENABLE(1); > + > + uint32_t *ring_ptr; > + uint32_t ring_offset; > + radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256, > &ring_offset, > + (void **)&ring_ptr); > + ring_ptr[0] = scratch_va; > + ring_ptr[1] = rsrc1; > + uint64_t va = > cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) + > ring_offset; > + > + radv_foreach_stage(stage, > cmd_buffer->scratch_needed_mask) { > + uint32_t reg_base; > + > + reg_base = shader_stage_to_user_data_0(stage); > + cmd_buffer->cs_to_patch_scratch[idx++] = > PKT3(PKT3_SET_SH_REG, 2, 0); > + cmd_buffer->cs_to_patch_scratch[idx++] = > (reg_base - SI_SH_REG_OFFSET) >> 2; > + cmd_buffer->cs_to_patch_scratch[idx++] = va; > + cmd_buffer->cs_to_patch_scratch[idx++] = va >> > 32; > + } > + } > + > if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) || > cmd_buffer->record_fail) > return VK_ERROR_OUT_OF_DEVICE_MEMORY; > @@ -1629,9 +1749,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer > *cmd_buffer) > radeon_emit(cmd_buffer->cs, compute_shader->rsrc1); > radeon_emit(cmd_buffer->cs, compute_shader->rsrc2); > > + if (compute_shader->config.scratch_bytes_per_wave) { > + uint32_t needed = > compute_shader->config.scratch_bytes_per_wave * > cmd_buffer->device->scratch_waves; > + if (needed > cmd_buffer->compute_scratch_size_needed) > + cmd_buffer->compute_scratch_size_needed = needed; > + } > + > /* change these once we have scratch support */ > radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE, > - S_00B860_WAVES(32) | S_00B860_WAVESIZE(0)); > + > 
S_00B860_WAVES(cmd_buffer->device->scratch_waves) | > + > S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10)); > > radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3); > radeon_emit(cmd_buffer->cs, > @@ -1821,6 +1948,14 @@ void radv_CmdExecuteCommands( > for (uint32_t i = 0; i < commandBufferCount; i++) { > RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]); > > + if (secondary->scratch_size_needed > > primary->scratch_size_needed) > + primary->scratch_size_needed = > secondary->scratch_size_needed; > + > + if (secondary->compute_scratch_size_needed > > primary->compute_scratch_size_needed) > + primary->compute_scratch_size_needed = > secondary->compute_scratch_size_needed; > + > + primary->scratch_needed_mask |= > secondary->scratch_needed_mask; > + > primary->device->ws->cs_execute_secondary(primary->cs, > secondary->cs); > } > > diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c > index 4aa6af2..c465186 100644 > --- a/src/amd/vulkan/radv_device.c > +++ b/src/amd/vulkan/radv_device.c > @@ -781,6 +781,13 @@ VkResult radv_CreateDevice( > } > } > > + /* TODO : predicate on LLVM version this goes into */ > +#if HAVE_LLVM < 0x0500 > + device->llvm_supports_spill = false; > +#else > + device->llvm_supports_spill = true; > +#endif > + > result = radv_device_init_meta(device); > if (result != VK_SUCCESS) > goto fail; > @@ -814,6 +821,21 @@ VkResult radv_CreateDevice( > goto fail; > } > > + /* The maximum number of scratch waves. Scratch space isn't > divided > + * evenly between CUs. The number is only a function of the > number of CUs. > + * We can decrease the constant to decrease the scratch buffer > size. > + * > + * sctx->scratch_waves must be >= the maximum posible size of > + * 1 threadgroup, so that the hw doesn't hang from being unable > + * to start any. > + * > + * The recommended value is 4 per CU at most. Higher numbers > don't > + * bring much benefit, but they still occupy chip resources > (think > + * async compute). I've seen ~2% performance difference between 4 > and 32. 
> + */ > + uint32_t max_threads_per_block = 2048; > + device->scratch_waves = MAX2(32 * > physical_device->rad_info.num_good_compute_units, > + max_threads_per_block / 64); > *pDevice = radv_device_to_handle(device); > return VK_SUCCESS; > > diff --git a/src/amd/vulkan/radv_pipeline.c > b/src/amd/vulkan/radv_pipeline.c > index 360b519..060cfbb 100644 > --- a/src/amd/vulkan/radv_pipeline.c > +++ b/src/amd/vulkan/radv_pipeline.c > @@ -354,12 +354,13 @@ static void radv_fill_shader_variant(struct > radv_device *device, > struct ac_shader_binary *binary, > gl_shader_stage stage) > { > - variant->code_size = binary->code_size; > bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0; > unsigned vgpr_comp_cnt = 0; > > - if (scratch_enabled) > - radv_finishme("shader scratch space"); > + if (scratch_enabled && !device->llvm_supports_spill) > + radv_finishme("shader scratch support only available with > LLVM 5.0"); > + > + variant->code_size = binary->code_size; > > switch (stage) { > case MESA_SHADER_VERTEX: > @@ -424,7 +425,8 @@ static struct radv_shader_variant > *radv_shader_variant_create(struct radv_device > options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH); > options.family = chip_family; > options.chip_class = device->physical_device->rad_info.chip_class; > - tm = ac_create_target_machine(chip_family); > + options.supports_spill = device->llvm_supports_spill; > + tm = ac_create_target_machine(chip_family, > options.supports_spill); > ac_compile_nir_shader(tm, &binary, &variant->config, > &variant->info, shader, &options, dump); > LLVMDisposeTargetMachine(tm); > diff --git a/src/amd/vulkan/radv_private.h > b/src/amd/vulkan/radv_private.h > index 0b8f50a..6c746b5 100644 > --- a/src/amd/vulkan/radv_private.h > +++ b/src/amd/vulkan/radv_private.h > @@ -485,6 +485,8 @@ struct radv_device { > > uint64_t debug_flags; > > + bool llvm_supports_spill; > + uint32_t scratch_waves; > /* MSAA sample locations. > * The first index is the sample index. > * The second index is the coordinate: X, Y. */ > @@ -726,6 +728,17 @@ struct radv_cmd_buffer { > struct radv_cmd_buffer_upload upload; > > bool record_fail; > + > + /* for primary cmd buffers */ > + struct radeon_winsys_bo *scratch_bo; > + struct radeon_winsys_bo *compute_scratch_bo; > + uint32_t scratch_patch_idx; > + uint32_t *cs_to_patch_scratch; > + > + /* for primary + secondary cmd buffers */ > + uint32_t scratch_needed_mask; > + uint32_t scratch_size_needed; > + uint32_t compute_scratch_size_needed; > }; > > struct radv_image; > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev