Module: Mesa Branch: main Commit: 533ec9843e466e0cadf876a39d6fac74db2bdcd6 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=533ec9843e466e0cadf876a39d6fac74db2bdcd6
Author: Tatsuyuki Ishi <ishitatsuy...@gmail.com> Date: Thu Dec 14 18:15:11 2023 +0900 radv: Precompute shader max_waves. Doing it at bind-time causes a 1.4% overhead (among all driver calls) in Overwatch 2. !24502 mentions that it can be precomputed in case overhead is a concern, so do it here. max_waves is stored directly in the radv_shader struct, because ac_shader_config conforms to LLVM ABI and we cannot add anything custom, and radv_shader_info needs to be determined from NIR only. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26692> --- src/amd/vulkan/radv_pipeline.c | 3 +-- src/amd/vulkan/radv_shader.c | 38 ++++++++++++++++++++------------------ src/amd/vulkan/radv_shader.h | 3 +-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index bc79dec0df6..efc86f05f90 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -961,7 +961,6 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 : pdevice->rad_info.lds_encode_granularity; - unsigned max_waves = radv_get_max_waves(device, shader, stage); VkPipelineExecutableStatisticKHR *s = pStatistics; VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0); @@ -1035,7 +1034,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut desc_copy(s->name, "Subgroups per SIMD"); desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit"); s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR; - s->value.u64 = max_waves; + s->value.u64 = shader->max_waves; } ++s; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 9b4b6fcef43..7b911a44906 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2049,45 +2049,46 @@ radv_shader_upload(struct radv_device *device, struct radv_shader *shader, const return true; } -unsigned -radv_get_max_waves(const struct radv_device *device, struct radv_shader *shader, gl_shader_stage stage) -{ - const struct radeon_info *info = &device->physical_device->rad_info; - const enum amd_gfx_level gfx_level = info->gfx_level; - const uint8_t wave_size = shader->info.wave_size; - const struct ac_shader_config *conf = &shader->config; - unsigned max_simd_waves = info->max_waves_per_simd; +static unsigned +radv_get_max_waves(const struct radv_device *device, const struct ac_shader_config *conf, + const struct radv_shader_info *info) +{ + const struct radeon_info *rad_info = &device->physical_device->rad_info; + const enum amd_gfx_level gfx_level = rad_info->gfx_level; + const uint8_t wave_size = info->wave_size; + gl_shader_stage stage = info->stage; + unsigned max_simd_waves = rad_info->max_waves_per_simd; unsigned lds_per_wave = 0; if (stage == MESA_SHADER_FRAGMENT) { - lds_per_wave = conf->lds_size * info->lds_encode_granularity + shader->info.ps.num_interp * 48; - lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity); + lds_per_wave = conf->lds_size * rad_info->lds_encode_granularity + info->ps.num_interp * 48; + lds_per_wave = align(lds_per_wave, rad_info->lds_alloc_granularity); } else if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_TASK) { - unsigned max_workgroup_size = shader->info.workgroup_size; - lds_per_wave = align(conf->lds_size * info->lds_encode_granularity, info->lds_alloc_granularity); + unsigned max_workgroup_size = info->workgroup_size; + lds_per_wave = align(conf->lds_size * rad_info->lds_encode_granularity, rad_info->lds_alloc_granularity); lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size); } if (conf->num_sgprs && gfx_level < GFX10) { unsigned sgprs = align(conf->num_sgprs, gfx_level >= GFX8 ? 16 : 8); - max_simd_waves = MIN2(max_simd_waves, info->num_physical_sgprs_per_simd / sgprs); + max_simd_waves = MIN2(max_simd_waves, rad_info->num_physical_sgprs_per_simd / sgprs); } if (conf->num_vgprs) { - unsigned physical_vgprs = info->num_physical_wave64_vgprs_per_simd * (64 / wave_size); + unsigned physical_vgprs = rad_info->num_physical_wave64_vgprs_per_simd * (64 / wave_size); unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4); if (gfx_level >= GFX10_3) { - unsigned real_vgpr_gran = info->num_physical_wave64_vgprs_per_simd / 64; + unsigned real_vgpr_gran = rad_info->num_physical_wave64_vgprs_per_simd / 64; vgprs = util_align_npot(vgprs, real_vgpr_gran * (wave_size == 32 ? 2 : 1)); } max_simd_waves = MIN2(max_simd_waves, physical_vgprs / vgprs); } - unsigned simd_per_workgroup = info->num_simd_per_compute_unit; + unsigned simd_per_workgroup = rad_info->num_simd_per_compute_unit; if (gfx_level >= GFX10) simd_per_workgroup *= 2; /* like lds_size_per_workgroup, assume WGP on GFX10+ */ - unsigned max_lds_per_simd = info->lds_size_per_workgroup / simd_per_workgroup; + unsigned max_lds_per_simd = rad_info->lds_size_per_workgroup / simd_per_workgroup; if (lds_per_wave) max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_lds_per_simd, lds_per_wave)); @@ -2099,7 +2100,7 @@ radv_get_max_scratch_waves(const struct radv_device *device, struct radv_shader { const unsigned num_cu = device->physical_device->rad_info.num_cu; - return MIN2(device->scratch_waves, 4 * num_cu * radv_get_max_waves(device, shader, shader->info.stage)); + return MIN2(device->scratch_waves, 4 * num_cu * shader->max_waves); } VkResult @@ -2118,6 +2119,7 @@ radv_shader_create_uncached(struct radv_device *device, const struct radv_shader shader->info = binary->info; shader->config = binary->config; + shader->max_waves = radv_get_max_waves(device, &shader->config, &shader->info); if (binary->type == RADV_BINARY_TYPE_RTLD) { #if !defined(USE_LIBELF) diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 19038b06dbd..f4eaf5fae5f 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -613,6 +613,7 @@ struct radv_shader { uint32_t code_size; uint32_t exec_size; struct radv_shader_info info; + uint32_t max_waves; blake3_hash hash; void *code; @@ -757,8 +758,6 @@ struct radv_shader_part *radv_shader_part_cache_get(struct radv_device *device, uint64_t radv_shader_get_va(const struct radv_shader *shader); struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc); -unsigned radv_get_max_waves(const struct radv_device *device, struct radv_shader *shader, gl_shader_stage stage); - unsigned radv_get_max_scratch_waves(const struct radv_device *device, struct radv_shader *shader); const char *radv_get_shader_name(const struct radv_shader_info *info, gl_shader_stage stage);