Mesa (main): radv: Precompute shader max_waves.

GitLab Mirror Mon, 18 Dec 2023 07:32:36 -0800

Module: Mesa
Branch: main
Commit: 533ec9843e466e0cadf876a39d6fac74db2bdcd6
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=533ec9843e466e0cadf876a39d6fac74db2bdcd6


Author: Tatsuyuki Ishi <ishitatsuy...@gmail.com>
Date:   Thu Dec 14 18:15:11 2023 +0900

radv: Precompute shader max_waves.

Doing it at bind-time causes a 1.4% overhead (among all driver calls) in
Overwatch 2. !24502 mentions that it can be precomputed in case overhead
is a concern, so do it here.

max_waves is stored directly in the radv_shader struct, because
ac_shader_config conforms to LLVM ABI and we cannot add anything custom,
and radv_shader_info needs to be determined from NIR only.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26692>

---

 src/amd/vulkan/radv_pipeline.c |  3 +--
 src/amd/vulkan/radv_shader.c   | 38 ++++++++++++++++++++------------------
 src/amd/vulkan/radv_shader.h   |  3 +--
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index bc79dec0df6..efc86f05f90 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -961,7 +961,6 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, 
const VkPipelineExecut
    unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == 
MESA_SHADER_FRAGMENT
                                ? 1024
                                : pdevice->rad_info.lds_encode_granularity;
-   unsigned max_waves = radv_get_max_waves(device, shader, stage);
 
    VkPipelineExecutableStatisticKHR *s = pStatistics;
    VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount 
: 0);
@@ -1035,7 +1034,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, 
const VkPipelineExecut
       desc_copy(s->name, "Subgroups per SIMD");
       desc_copy(s->description, "The maximum number of subgroups in flight on 
a SIMD unit");
       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
-      s->value.u64 = max_waves;
+      s->value.u64 = shader->max_waves;
    }
    ++s;
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 9b4b6fcef43..7b911a44906 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2049,45 +2049,46 @@ radv_shader_upload(struct radv_device *device, struct 
radv_shader *shader, const
    return true;
 }
 
-unsigned
-radv_get_max_waves(const struct radv_device *device, struct radv_shader 
*shader, gl_shader_stage stage)
-{
-   const struct radeon_info *info = &device->physical_device->rad_info;
-   const enum amd_gfx_level gfx_level = info->gfx_level;
-   const uint8_t wave_size = shader->info.wave_size;
-   const struct ac_shader_config *conf = &shader->config;
-   unsigned max_simd_waves = info->max_waves_per_simd;
+static unsigned
+radv_get_max_waves(const struct radv_device *device, const struct 
ac_shader_config *conf,
+                   const struct radv_shader_info *info)
+{
+   const struct radeon_info *rad_info = &device->physical_device->rad_info;
+   const enum amd_gfx_level gfx_level = rad_info->gfx_level;
+   const uint8_t wave_size = info->wave_size;
+   gl_shader_stage stage = info->stage;
+   unsigned max_simd_waves = rad_info->max_waves_per_simd;
    unsigned lds_per_wave = 0;
 
    if (stage == MESA_SHADER_FRAGMENT) {
-      lds_per_wave = conf->lds_size * info->lds_encode_granularity + 
shader->info.ps.num_interp * 48;
-      lds_per_wave = align(lds_per_wave, info->lds_alloc_granularity);
+      lds_per_wave = conf->lds_size * rad_info->lds_encode_granularity + 
info->ps.num_interp * 48;
+      lds_per_wave = align(lds_per_wave, rad_info->lds_alloc_granularity);
    } else if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_TASK) {
-      unsigned max_workgroup_size = shader->info.workgroup_size;
-      lds_per_wave = align(conf->lds_size * info->lds_encode_granularity, 
info->lds_alloc_granularity);
+      unsigned max_workgroup_size = info->workgroup_size;
+      lds_per_wave = align(conf->lds_size * rad_info->lds_encode_granularity, 
rad_info->lds_alloc_granularity);
       lds_per_wave /= DIV_ROUND_UP(max_workgroup_size, wave_size);
    }
 
    if (conf->num_sgprs && gfx_level < GFX10) {
       unsigned sgprs = align(conf->num_sgprs, gfx_level >= GFX8 ? 16 : 8);
-      max_simd_waves = MIN2(max_simd_waves, info->num_physical_sgprs_per_simd 
/ sgprs);
+      max_simd_waves = MIN2(max_simd_waves, 
rad_info->num_physical_sgprs_per_simd / sgprs);
    }
 
    if (conf->num_vgprs) {
-      unsigned physical_vgprs = info->num_physical_wave64_vgprs_per_simd * (64 
/ wave_size);
+      unsigned physical_vgprs = rad_info->num_physical_wave64_vgprs_per_simd * 
(64 / wave_size);
       unsigned vgprs = align(conf->num_vgprs, wave_size == 32 ? 8 : 4);
       if (gfx_level >= GFX10_3) {
-         unsigned real_vgpr_gran = info->num_physical_wave64_vgprs_per_simd / 
64;
+         unsigned real_vgpr_gran = 
rad_info->num_physical_wave64_vgprs_per_simd / 64;
          vgprs = util_align_npot(vgprs, real_vgpr_gran * (wave_size == 32 ? 2 
: 1));
       }
       max_simd_waves = MIN2(max_simd_waves, physical_vgprs / vgprs);
    }
 
-   unsigned simd_per_workgroup = info->num_simd_per_compute_unit;
+   unsigned simd_per_workgroup = rad_info->num_simd_per_compute_unit;
    if (gfx_level >= GFX10)
       simd_per_workgroup *= 2; /* like lds_size_per_workgroup, assume WGP on 
GFX10+ */
 
-   unsigned max_lds_per_simd = info->lds_size_per_workgroup / 
simd_per_workgroup;
+   unsigned max_lds_per_simd = rad_info->lds_size_per_workgroup / 
simd_per_workgroup;
    if (lds_per_wave)
       max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_lds_per_simd, 
lds_per_wave));
 
@@ -2099,7 +2100,7 @@ radv_get_max_scratch_waves(const struct radv_device 
*device, struct radv_shader
 {
    const unsigned num_cu = device->physical_device->rad_info.num_cu;
 
-   return MIN2(device->scratch_waves, 4 * num_cu * radv_get_max_waves(device, 
shader, shader->info.stage));
+   return MIN2(device->scratch_waves, 4 * num_cu * shader->max_waves);
 }
 
 VkResult
@@ -2118,6 +2119,7 @@ radv_shader_create_uncached(struct radv_device *device, 
const struct radv_shader
 
    shader->info = binary->info;
    shader->config = binary->config;
+   shader->max_waves = radv_get_max_waves(device, &shader->config, 
&shader->info);
 
    if (binary->type == RADV_BINARY_TYPE_RTLD) {
 #if !defined(USE_LIBELF)
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 19038b06dbd..f4eaf5fae5f 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -613,6 +613,7 @@ struct radv_shader {
    uint32_t code_size;
    uint32_t exec_size;
    struct radv_shader_info info;
+   uint32_t max_waves;
 
    blake3_hash hash;
    void *code;
@@ -757,8 +758,6 @@ struct radv_shader_part *radv_shader_part_cache_get(struct 
radv_device *device,
 uint64_t radv_shader_get_va(const struct radv_shader *shader);
 struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc);
 
-unsigned radv_get_max_waves(const struct radv_device *device, struct 
radv_shader *shader, gl_shader_stage stage);
-
 unsigned radv_get_max_scratch_waves(const struct radv_device *device, struct 
radv_shader *shader);
 
 const char *radv_get_shader_name(const struct radv_shader_info *info, 
gl_shader_stage stage);

Mesa (main): radv: Precompute shader max_waves.

Reply via email to