I'm not sure if using a scratch buffer per command buffer is correct.
AFAIU each ring has a separate counter for the scratch offsets, and if a
command buffer is used in multiple compute rings at the same time, these
separate counters could conflict.

I'd think we need a preamble IB per queue that sets SGPR0/1 for all
relevant stages, plus a winsys change so that it gets submitted in the
same submit ioctl as the application command buffers.
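
Untested sketch of the kind of thing I mean (the function and where it
gets called from are hypothetical; radeon_set_sh_reg_seq/radeon_emit and
shader_stage_to_user_data_0 are the existing helpers used in the patch):

    /* Hypothetical per-queue preamble: point the scratch user SGPRs
     * (SGPR0/1) of every stage at a queue-owned scratch descriptor once,
     * instead of patching every command buffer at vkEndCommandBuffer
     * time. */
    static void
    radv_queue_emit_scratch_preamble(struct radeon_winsys_cs *cs,
                                     uint64_t scratch_desc_va)
    {
            for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
                    uint32_t reg_base = shader_stage_to_user_data_0(stage);

                    radeon_set_sh_reg_seq(cs, reg_base, 2);
                    radeon_emit(cs, scratch_desc_va);       /* address low dword */
                    radeon_emit(cs, scratch_desc_va >> 32); /* address high dword */
            }
    }

The winsys change would then be to accept such a CS as a preamble IB and
submit it in the same ioctl as the application CS.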

- Bas

On Tue, Jan 24, 2017, at 18:32, Dave Airlie wrote:
> From: Dave Airlie <airl...@redhat.com>
> 
> Currently LLVM 5.0 has support for spilling to a place
> pointed to by the user sgprs instead of using relocations.
> 
> This is enabled by using the amdgcn-mesa-mesa3d triple.
> 
> For compute shaders we spill to a buffer pointed to by a 64-bit
> address stored in sgprs 0/1.
> For other gfx shaders we spill to a buffer pointed to by the
> first two dwords of the buffer pointed to in sgprs 0/1.
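
(Side note: judging from the radv_EndCommandBuffer hunk below, the two
cases amount to roughly the following on the CPU side. The wrapper
function is made up for illustration; everything it calls is taken from
the patch, which in reality patches pre-reserved PKT3_SET_SH_REG slots
rather than emitting the packets directly.)

    static void
    emit_scratch_user_sgprs(struct radv_cmd_buffer *cmd_buffer,
                            uint64_t scratch_va, uint32_t rsrc1)
    {
            /* Compute: SGPRs 0/1 get the scratch buffer address directly
             * (low dword, then rsrc1 = BASE_ADDRESS_HI | swizzle). Note the
             * patch actually uses separate BOs for compute and graphics
             * scratch; a single scratch_va is a simplification here. */
            radeon_set_sh_reg_seq(cmd_buffer->cs,
                                  shader_stage_to_user_data_0(MESA_SHADER_COMPUTE), 2);
            radeon_emit(cmd_buffer->cs, scratch_va);
            radeon_emit(cmd_buffer->cs, rsrc1);

            /* Graphics: SGPRs 0/1 get a pointer to a small upload-BO blob
             * whose first two dwords are that address/rsrc1 pair. */
            uint32_t *ring_ptr;
            uint32_t ring_offset;
            radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256, &ring_offset,
                                         (void **)&ring_ptr);
            ring_ptr[0] = scratch_va;
            ring_ptr[1] = rsrc1;

            uint64_t desc_va = cmd_buffer->device->ws->buffer_get_va(
                                       cmd_buffer->upload.upload_bo) + ring_offset;
            radv_foreach_stage(stage, cmd_buffer->scratch_needed_mask) {
                    radeon_set_sh_reg_seq(cmd_buffer->cs,
                                          shader_stage_to_user_data_0(stage), 2);
                    radeon_emit(cmd_buffer->cs, desc_va);
                    radeon_emit(cmd_buffer->cs, desc_va >> 32);
            }
    }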
> 
> This patch enables radv to use the llvm support when present.
> 
> This fixes the first screen of Sascha Willems' computeshader demo,
> and a bunch of CTS tests now pass.
> 
> This support is likely to be in the LLVM 4.0 release as well
> (fingers crossed), in which case we need to adjust the detection
> logic.
> 
> Signed-off-by: Dave Airlie <airl...@redhat.com>
> ---
>  src/amd/common/ac_binary.c       |  30 +++++----
>  src/amd/common/ac_binary.h       |   4 +-
>  src/amd/common/ac_llvm_util.c    |   4 +-
>  src/amd/common/ac_llvm_util.h    |   2 +-
>  src/amd/common/ac_nir_to_llvm.c  |  14 ++--
>  src/amd/common/ac_nir_to_llvm.h  |   6 +-
>  src/amd/vulkan/radv_cmd_buffer.c | 137 ++++++++++++++++++++++++++++++++++++++-
>  src/amd/vulkan/radv_device.c     |  22 +++++++
>  src/amd/vulkan/radv_pipeline.c   |  10 +--
>  src/amd/vulkan/radv_private.h    |  13 ++++
>  10 files changed, 215 insertions(+), 27 deletions(-)
> 
> diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
> index 01cf000..9c66a82 100644
> --- a/src/amd/common/ac_binary.c
> +++ b/src/amd/common/ac_binary.c
> @@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol =
>  
>  void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>                                 struct ac_shader_config *conf,
> -                                 unsigned symbol_offset)
> +                                 unsigned symbol_offset,
> +                                 bool supports_spill)
>  {
>       unsigned i;
>       const unsigned char *config =
>               ac_shader_binary_config_start(binary, symbol_offset);
>       bool really_needs_scratch = false;
> -
> +       uint32_t wavesize = 0;
>       /* LLVM adds SGPR spills to the scratch size.
>        * Find out if we really need the scratch buffer.
>        */
> -       for (i = 0; i < binary->reloc_count; i++) {
> -               const struct ac_shader_reloc *reloc = &binary->relocs[i];
> +       if (supports_spill) {
> +               really_needs_scratch = true;
> +       } else {
> +               for (i = 0; i < binary->reloc_count; i++) {
> +                       const struct ac_shader_reloc *reloc = &binary->relocs[i];
>  
> -               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> -                   !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> -                       really_needs_scratch = true;
> -                       break;
> +                       if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> +                           !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> +                               really_needs_scratch = true;
> +                               break;
> +                       }
>               }
>       }
>  
> @@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>               case R_0286E8_SPI_TMPRING_SIZE:
>               case R_00B860_COMPUTE_TMPRING_SIZE:
>                       /* WAVESIZE is in units of 256 dwords. */
> -                       if (really_needs_scratch)
> -                               conf->scratch_bytes_per_wave =
> -                                       G_00B860_WAVESIZE(value) * 256 * 4;
> +                       wavesize = value;
>                       break;
>               case SPILLED_SGPRS:
>                       conf->spilled_sgprs = value;
> @@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>               if (!conf->spi_ps_input_addr)
>                       conf->spi_ps_input_addr = conf->spi_ps_input_ena;
>       }
> +
> +       if (really_needs_scratch) {
> +               /* sgprs spills aren't spilling */
> +               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(wavesize) * 256 * 4;
> +       }
>  }
> diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
> index 282f33d..06fd855 100644
> --- a/src/amd/common/ac_binary.h
> +++ b/src/amd/common/ac_binary.h
> @@ -27,6 +27,7 @@
>  #pragma once
>  
>  #include <stdint.h>
> +#include <stdbool.h>
>  
>  struct ac_shader_reloc {
>       char name[32];
> @@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
>  
>  void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>                                 struct ac_shader_config *conf,
> -                                 unsigned symbol_offset);
> +                                 unsigned symbol_offset,
> +                                 bool supports_spill);
> diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
> index 770e3bd..3ba5281 100644
> --- a/src/amd/common/ac_llvm_util.c
> +++ b/src/amd/common/ac_llvm_util.c
> @@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum radeon_family family)
>       }
>  }
>  
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family)
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill)
>  {
>       assert(family >= CHIP_TAHITI);
>  
> -       const char *triple = "amdgcn--";
> +       const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--";
>       LLVMTargetRef target = ac_get_llvm_target(triple);
>       LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
>                                    target,
> diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
> index 802c266..2a5f325 100644
> --- a/src/amd/common/ac_llvm_util.h
> +++ b/src/amd/common/ac_llvm_util.h
> @@ -56,7 +56,7 @@ struct ac_llvm_context {
>       LLVMValueRef fpmath_md_2p5_ulp;
>  };
>  
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family);
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill);
>  
>  void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
>  bool ac_is_sgpr_param(LLVMValueRef param);
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 26b87e8..43e079e 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -458,10 +458,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
>           arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
>       set_llvm_calling_convention(ctx->main_function, ctx->stage);
>  
> -
>       ctx->shader_info->num_input_sgprs = 0;
>       ctx->shader_info->num_input_vgprs = 0;
>  
> +       ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
>       for (i = 0; i < user_sgpr_count; i++)
>               ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
>  
> @@ -475,6 +475,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
>  
>       arg_idx = 0;
>       user_sgpr_idx = 0;
> +
> +       set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx, 2);
> +       user_sgpr_idx += 2;
> +
>       for (unsigned i = 0; i < num_sets; ++i) {
>               if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
>                       set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
> @@ -4429,7 +4433,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>  
>       memset(shader_info, 0, sizeof(*shader_info));
>  
> -       LLVMSetTarget(ctx.module, "amdgcn--");
> +       LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
>       setup_types(&ctx);
>  
>       ctx.builder = LLVMCreateBuilderInContext(ctx.context);
> @@ -4563,7 +4567,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
>                                  struct ac_shader_config *config,
>                                  struct ac_shader_variant_info *shader_info,
>                                  gl_shader_stage stage,
> -                                  bool dump_shader)
> +                                  bool dump_shader, bool supports_spill)
>  {
>       if (dump_shader)
>               LLVMDumpModule(llvm_module);
> @@ -4577,7 +4581,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
>       if (dump_shader)
>               fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
>  
> -       ac_shader_binary_read_config(binary, config, 0);
> +       ac_shader_binary_read_config(binary, config, 0, supports_spill);
>  
>       LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
>       LLVMDisposeModule(llvm_module);
> @@ -4637,7 +4641,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
>       LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
>                                                            options);
>  
> -       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader);
> +       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
>       switch (nir->stage) {
>       case MESA_SHADER_COMPUTE:
>               for (int i = 0; i < 3; ++i)
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index a57558e..9d66f94 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -52,6 +52,7 @@ struct ac_nir_compiler_options {
>       struct radv_pipeline_layout *layout;
>       union ac_shader_variant_key key;
>       bool unsafe_math;
> +       bool supports_spill;
>       enum radeon_family family;
>       enum chip_class chip_class;
>  };
> @@ -64,8 +65,9 @@ struct ac_userdata_info {
>  };
>  
>  enum ac_ud_index {
> -       AC_UD_PUSH_CONSTANTS = 0,
> -       AC_UD_SHADER_START = 1,
> +       AC_UD_SCRATCH = 0,
> +       AC_UD_PUSH_CONSTANTS = 1,
> +       AC_UD_SHADER_START = 2,
>       AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
>       AC_UD_VS_BASE_VERTEX_START_INSTANCE,
>       AC_UD_VS_MAX_UD,
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index c62d275..e904897 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -466,6 +466,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer,
>       va = ws->buffer_get_va(vs->bo);
>       ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8);
>  
> +       if (vs->config.scratch_bytes_per_wave) {
> +               uint32_t needed = vs->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->scratch_size_needed)
> +                       cmd_buffer->scratch_size_needed = needed;
> +               cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_VERTEX);
> +       }
> +
>       clip_dist_mask = vs->info.vs.clip_dist_mask;
>       cull_dist_mask = vs->info.vs.cull_dist_mask;
>       total_mask = clip_dist_mask | cull_dist_mask;
> @@ -536,6 +543,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
>       va = ws->buffer_get_va(ps->bo);
>       ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
>  
> +       if (ps->config.scratch_bytes_per_wave) {
> +               uint32_t needed = ps->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->scratch_size_needed)
> +                       cmd_buffer->scratch_size_needed = needed;
> +               cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_FRAGMENT);
> +       }
> +
>       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
>       radeon_emit(cmd_buffer->cs, va >> 8);
>       radeon_emit(cmd_buffer->cs, va >> 40);
> @@ -627,6 +641,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer,
>       radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
>                              pipeline->graphics.prim_restart_enable);
>  
> +       uint32_t max_scratch_bytes_per_wave = 0;
> +       max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +                                         pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave);
> +       max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +                                         pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave);
> +
> +       radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
> +                              S_0286E8_WAVES(cmd_buffer->device->scratch_waves) |
> +                              S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10));
>       cmd_buffer->state.emitted_pipeline = pipeline;
>  }
>  
> @@ -1372,6 +1395,13 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
>  
>       if (cmd_buffer->upload.upload_bo)
>               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
> +
> +       if (cmd_buffer->scratch_bo)
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> +
> +       if (cmd_buffer->compute_scratch_bo)
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> +
>       cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
>       vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
>  }
> @@ -1402,6 +1432,19 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
>               free(up);
>       }
>  
> +       if (cmd_buffer->scratch_bo) {
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> +               cmd_buffer->scratch_bo = NULL;
> +       }
> +
> +       if (cmd_buffer->compute_scratch_bo) {
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> +               cmd_buffer->compute_scratch_bo = NULL;
> +       }
> +
> +       cmd_buffer->scratch_needed_mask = 0;
> +       cmd_buffer->scratch_size_needed = 0;
> +       cmd_buffer->compute_scratch_size_needed = 0;
>       if (cmd_buffer->upload.upload_bo)
>               cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
>                                                     cmd_buffer->upload.upload_bo, 8);
> @@ -1457,6 +1500,19 @@ VkResult radv_BeginCommandBuffer(
>               default:
>                       break;
>               }
> +
> +               uint32_t pad_word = 0xffff1000U;
> +               if (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2)
> +                       pad_word = 0x80000000;
> +
> +               cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw;
> +               cmd_buffer->cs_to_patch_scratch = cmd_buffer->cs->buf;
> +               for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +               }
>       }
>  
>       if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
> @@ -1594,6 +1650,70 @@ VkResult radv_EndCommandBuffer(
>  
>       if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
>               si_emit_cache_flush(cmd_buffer);
> +
> +       int idx = cmd_buffer->scratch_patch_idx;
> +       if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->compute_scratch_size_needed) {
> +               cmd_buffer->compute_scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +                                                                                      cmd_buffer->compute_scratch_size_needed,
> +                                                                                      4096,
> +                                                                                      RADEON_DOMAIN_VRAM,
> +                                                                                      RADEON_FLAG_NO_CPU_ACCESS);
> +
> +               if (!cmd_buffer->compute_scratch_bo) {
> +                       cmd_buffer->record_fail = true;
> +                       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> +               }
> +               cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->compute_scratch_bo, 8);
> +
> +               uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->compute_scratch_bo);
> +               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +                       S_008F04_SWIZZLE_ENABLE(1);
> +               uint32_t reg_base;
> +
> +               reg_base = shader_stage_to_user_data_0(MESA_SHADER_COMPUTE);
> +               cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
> +               cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
> +               cmd_buffer->cs_to_patch_scratch[idx++] = scratch_va;
> +               cmd_buffer->cs_to_patch_scratch[idx++] = rsrc1;
> +       }
> +
> +       if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->scratch_size_needed) {
> +               cmd_buffer->scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +                                                                              cmd_buffer->scratch_size_needed,
> +                                                                              4096,
> +                                                                              RADEON_DOMAIN_VRAM,
> +                                                                              RADEON_FLAG_NO_CPU_ACCESS);
> +
> +               if (!cmd_buffer->scratch_bo) {
> +                       cmd_buffer->record_fail = true;
> +                       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> +               }
> +
> +               cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->scratch_bo, 8);
> +
> +               uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo);
> +               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +                       S_008F04_SWIZZLE_ENABLE(1);
> +
> +               uint32_t *ring_ptr;
> +               uint32_t ring_offset;
> +               radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256, &ring_offset,
> +                                            (void **)&ring_ptr);
> +               ring_ptr[0] = scratch_va;
> +               ring_ptr[1] = rsrc1;
> +               uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) + ring_offset;
> +
> +               radv_foreach_stage(stage, cmd_buffer->scratch_needed_mask) {
> +                       uint32_t reg_base;
> +
> +                       reg_base = shader_stage_to_user_data_0(stage);
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = va;
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = va >> 32;
> +               }
> +       }
> +
>       if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
>           cmd_buffer->record_fail)
>               return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> @@ -1629,9 +1749,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
>       radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
>       radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
>  
> +       if (compute_shader->config.scratch_bytes_per_wave) {
> +               uint32_t needed = compute_shader->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->compute_scratch_size_needed)
> +                       cmd_buffer->compute_scratch_size_needed = needed;
> +       }
> +
>       /* change these once we have scratch support */
>       radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
> -                         S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
> +                         S_00B860_WAVES(cmd_buffer->device->scratch_waves) |
> +                         S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10));
>  
>       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
>       radeon_emit(cmd_buffer->cs,
> @@ -1821,6 +1948,14 @@ void radv_CmdExecuteCommands(
>       for (uint32_t i = 0; i < commandBufferCount; i++) {
>               RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
>  
> +               if (secondary->scratch_size_needed > primary->scratch_size_needed)
> +                       primary->scratch_size_needed = secondary->scratch_size_needed;
> +
> +               if (secondary->compute_scratch_size_needed > primary->compute_scratch_size_needed)
> +                       primary->compute_scratch_size_needed = secondary->compute_scratch_size_needed;
> +
> +               primary->scratch_needed_mask |= secondary->scratch_needed_mask;
> +
>               primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
>       }
>  
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 4aa6af2..c465186 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -781,6 +781,13 @@ VkResult radv_CreateDevice(
>               }
>       }
>  
> +       /* TODO : predicate on LLVM version this goes into */
> +#if HAVE_LLVM < 0x0500
> +       device->llvm_supports_spill = false;
> +#else
> +       device->llvm_supports_spill = true;
> +#endif
> +
>       result = radv_device_init_meta(device);
>       if (result != VK_SUCCESS)
>               goto fail;
> @@ -814,6 +821,21 @@ VkResult radv_CreateDevice(
>                       goto fail;
>       }
>  
> +       /* The maximum number of scratch waves. Scratch space isn't divided
> +        * evenly between CUs. The number is only a function of the number of CUs.
> +        * We can decrease the constant to decrease the scratch buffer size.
> +        *
> +        * sctx->scratch_waves must be >= the maximum possible size of
> +        * 1 threadgroup, so that the hw doesn't hang from being unable
> +        * to start any.
> +        *
> +        * The recommended value is 4 per CU at most. Higher numbers don't
> +        * bring much benefit, but they still occupy chip resources (think
> +        * async compute). I've seen ~2% performance difference between 4 and 32.
> +        */
> +       uint32_t max_threads_per_block = 2048;
> +       device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
> +                                    max_threads_per_block / 64);
>       *pDevice = radv_device_to_handle(device);
>       return VK_SUCCESS;
>  
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index 360b519..060cfbb 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -354,12 +354,13 @@ static void radv_fill_shader_variant(struct radv_device *device,
>                                    struct ac_shader_binary *binary,
>                                    gl_shader_stage stage)
>  {
> -       variant->code_size = binary->code_size;
>       bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
>       unsigned vgpr_comp_cnt = 0;
>  
> -       if (scratch_enabled)
> -               radv_finishme("shader scratch space");
> +       if (scratch_enabled && !device->llvm_supports_spill)
> +               radv_finishme("shader scratch support only available with LLVM 5.0");
> +
> +       variant->code_size = binary->code_size;
>  
>       switch (stage) {
>       case MESA_SHADER_VERTEX:
> @@ -424,7 +425,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
>       options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
>       options.family = chip_family;
>       options.chip_class = device->physical_device->rad_info.chip_class;
> -       tm = ac_create_target_machine(chip_family);
> +       options.supports_spill = device->llvm_supports_spill;
> +       tm = ac_create_target_machine(chip_family, options.supports_spill);
>       ac_compile_nir_shader(tm, &binary, &variant->config,
>                             &variant->info, shader, &options, dump);
>       LLVMDisposeTargetMachine(tm);
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 0b8f50a..6c746b5 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -485,6 +485,8 @@ struct radv_device {
>  
>       uint64_t debug_flags;
>  
> +       bool llvm_supports_spill;
> +       uint32_t scratch_waves;
>       /* MSAA sample locations.
>        * The first index is the sample index.
>        * The second index is the coordinate: X, Y. */
> @@ -726,6 +728,17 @@ struct radv_cmd_buffer {
>       struct radv_cmd_buffer_upload upload;
>  
>       bool record_fail;
> +
> +       /* for primary cmd buffers */
> +       struct radeon_winsys_bo *scratch_bo;
> +       struct radeon_winsys_bo *compute_scratch_bo;
> +       uint32_t scratch_patch_idx;
> +       uint32_t *cs_to_patch_scratch;
> +
> +       /* for primary + secondary cmd buffers */
> +       uint32_t scratch_needed_mask;
> +       uint32_t scratch_size_needed;
> +       uint32_t compute_scratch_size_needed;
>  };
>  
>  struct radv_image;
> -- 
> 2.7.4
> 
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
