Mesa (main): radv, aco: Rework VS prolog key handling.

GitLab Mirror Mon, 13 Nov 2023 04:07:18 -0800

Module: Mesa
Branch: main
Commit: 3fc3a94bce6c542cc5c23eca832b6a5c45ac5c79
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3fc3a94bce6c542cc5c23eca832b6a5c45ac5c79


Author: Tatsuyuki Ishi <ishitatsuy...@gmail.com>
Date:   Fri Nov  3 19:07:51 2023 +0900

radv, aco: Rework VS prolog key handling.

The main change is to use struct radv_vs_prolog_key directly as the prolog
cache key, instead of the compressed representation, to simplify an upcoming
rework of prolog / epilog caching. In doing so, the state struct pointer was
replaced with an inline struct.

Care was also taken to pre-mask all the state fields with the active attribute
mask, and with other masks where it makes sense; this ensures that we don't
accidentally use information during compilation that was not hashed into the key.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26023>

---

 src/amd/compiler/aco_instruction_selection.cpp |   7 +-
 src/amd/vulkan/radv_aco_shader_info.h          |   4 +-
 src/amd/vulkan/radv_cmd_buffer.c               | 112 +++++--------------------
 src/amd/vulkan/radv_device.c                   |  24 +-----
 src/amd/vulkan/radv_shader.c                   |   2 +-
 src/amd/vulkan/radv_shader.h                   |  16 +++-
 6 files changed, 45 insertions(+), 120 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp 
b/src/amd/compiler/aco_instruction_selection.cpp
index 36bcf872479..336fd9260f5 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -12739,7 +12739,7 @@ select_vs_prolog(Program* program, const struct 
aco_vs_prolog_info* pinfo, ac_sh
    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
 
    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
-   bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & 
attrib_mask;
+   bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors;
 
    wait_imm lgkm_imm;
    lgkm_imm.lgkm = 0;
@@ -12800,10 +12800,9 @@ select_vs_prolog(Program* program, const struct 
aco_vs_prolog_info* pinfo, ac_sh
          }
 
          bool needs_instance_index =
-            pinfo->state.instance_rate_inputs & attrib_mask &
+            pinfo->state.instance_rate_inputs &
             ~(pinfo->state.zero_divisors | pinfo->state.nontrivial_divisors); 
/* divisor is 1 */
-         bool needs_start_instance =
-            pinfo->state.instance_rate_inputs & attrib_mask & 
pinfo->state.zero_divisors;
+         bool needs_start_instance = pinfo->state.instance_rate_inputs & 
pinfo->state.zero_divisors;
          bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & 
attrib_mask;
          if (needs_vertex_index)
             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, 
args->base_vertex),
diff --git a/src/amd/vulkan/radv_aco_shader_info.h 
b/src/amd/vulkan/radv_aco_shader_info.h
index b8887582525..e75ba5826aa 100644
--- a/src/amd/vulkan/radv_aco_shader_info.h
+++ b/src/amd/vulkan/radv_aco_shader_info.h
@@ -72,8 +72,8 @@ radv_aco_convert_shader_info(struct aco_shader_info 
*aco_info, const struct radv
    aco_info->next_stage_pc = radv_args->next_stage_pc;
 }
 
-#define ASSIGN_VS_STATE_FIELD(x)    aco_info->state.x = radv->state->x
-#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, 
&radv->state->x, sizeof(radv->state->x))
+#define ASSIGN_VS_STATE_FIELD(x)    aco_info->state.x = radv->state.x
+#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state.x, 
sizeof(radv->state.x))
 static inline void
 radv_aco_convert_vs_prolog_key(struct aco_vs_prolog_info *aco_info, const 
struct radv_vs_prolog_key *radv,
                                const struct radv_shader_args *radv_args)
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 0155bc2e2a2..19e0e55e45b 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -3691,53 +3691,25 @@ radv_instance_rate_prolog_index(unsigned 
num_attributes, uint32_t instance_rate_
    return start_index + offset_from_start_index + first;
 }
 
-union vs_prolog_key_header {
-   struct {
-      uint32_t key_size : 8;
-      uint32_t num_attributes : 6;
-      uint32_t as_ls : 1;
-      uint32_t is_ngg : 1;
-      uint32_t wave32 : 1;
-      uint32_t next_stage : 3;
-      uint32_t instance_rate_inputs : 1;
-      uint32_t alpha_adjust_lo : 1;
-      uint32_t alpha_adjust_hi : 1;
-      uint32_t misaligned_mask : 1;
-      uint32_t post_shuffle : 1;
-      uint32_t nontrivial_divisors : 1;
-      uint32_t zero_divisors : 1;
-      /* We need this to ensure the padding is zero. It's useful even if it's 
unused. */
-      uint32_t padding0 : 5;
-   };
-   uint32_t v;
-};
-
 uint32_t
 radv_hash_vs_prolog(const void *key_)
 {
-   const uint32_t *key = key_;
-   union vs_prolog_key_header header;
-   header.v = key[0];
-   return _mesa_hash_data(key, header.key_size);
+   const struct radv_vs_prolog_key *key = key_;
+   return _mesa_hash_data(key, sizeof(*key));
 }
 
 bool
 radv_cmp_vs_prolog(const void *a_, const void *b_)
 {
-   const uint32_t *a = a_;
-   const uint32_t *b = b_;
-   if (a[0] != b[0])
-      return false;
+   const struct radv_vs_prolog_key *a = a_;
+   const struct radv_vs_prolog_key *b = b_;
 
-   union vs_prolog_key_header header;
-   header.v = a[0];
-   return memcmp(a, b, header.key_size) == 0;
+   return memcmp(a, b, sizeof(*a)) == 0;
 }
 
 static struct radv_shader_part *
 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader 
*vs_shader, uint32_t *nontrivial_divisors)
 {
-   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
    assert(vs_shader->info.vs.dynamic_inputs);
 
    const struct radv_vs_input_state *state = 
&cmd_buffer->state.dynamic_vs_input;
@@ -3800,12 +3772,17 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, 
const struct radv_shader *v
    if (prolog)
       return prolog;
 
-   /* if we couldn't use a pre-compiled prolog, find one in the cache or 
create one */
-   uint32_t key_words[17];
-   unsigned key_size = 1;
-
    struct radv_vs_prolog_key key;
-   key.state = state;
+   memset(&key, 0, sizeof(key));
+   key.state.instance_rate_inputs = instance_rate_inputs;
+   key.state.nontrivial_divisors = *nontrivial_divisors;
+   key.state.zero_divisors = zero_divisors;
+   /* If the attribute is aligned, post shuffle is implemented using DST_SEL 
instead. */
+   key.state.post_shuffle = state->post_shuffle & attribute_mask & 
misaligned_mask;
+   key.state.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask;
+   key.state.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask;
+   u_foreach_bit (index, misaligned_mask)
+      key.state.formats[index] = state->formats[index];
    key.num_attributes = num_attributes;
    key.misaligned_mask = misaligned_mask;
    /* The instance ID input VGPR is placed differently when as_ls=true. */
@@ -3820,78 +3797,29 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, 
const struct radv_shader *v
       key.next_stage = vs_shader->info.stage;
    }
 
-   union vs_prolog_key_header header;
-   header.v = 0;
-   header.num_attributes = num_attributes;
-   header.as_ls = key.as_ls;
-   header.is_ngg = key.is_ngg;
-   header.wave32 = key.wave32;
-   header.next_stage = key.next_stage;
-
-   if (instance_rate_inputs & ~*nontrivial_divisors) {
-      header.instance_rate_inputs = true;
-      key_words[key_size++] = instance_rate_inputs;
-   }
-   if (*nontrivial_divisors) {
-      header.nontrivial_divisors = true;
-      key_words[key_size++] = *nontrivial_divisors;
-   }
-   if (zero_divisors) {
-      header.zero_divisors = true;
-      key_words[key_size++] = zero_divisors;
-   }
-   if (misaligned_mask) {
-      header.misaligned_mask = true;
-      key_words[key_size++] = misaligned_mask;
-
-      uint8_t *formats = (uint8_t *)&key_words[key_size];
-      unsigned num_formats = 0;
-      u_foreach_bit (index, misaligned_mask)
-         formats[num_formats++] = state->formats[index];
-      while (num_formats & 0x3)
-         formats[num_formats++] = 0;
-      key_size += num_formats / 4u;
-
-      if (state->post_shuffle & attribute_mask) {
-         header.post_shuffle = true;
-         key_words[key_size++] = state->post_shuffle & attribute_mask;
-      }
-   }
-   if (state->alpha_adjust_lo & attribute_mask) {
-      header.alpha_adjust_lo = true;
-      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
-   }
-   if (state->alpha_adjust_hi & attribute_mask) {
-      header.alpha_adjust_hi = true;
-      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
-   }
-
-   header.key_size = key_size * sizeof(key_words[0]);
-   key_words[0] = header.v;
-
-   uint32_t hash = radv_hash_vs_prolog(key_words);
+   uint32_t hash = radv_hash_vs_prolog(&key);
 
    u_rwlock_rdlock(&device->vs_prologs_lock);
-   struct hash_entry *prolog_entry = 
_mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
+   struct hash_entry *prolog_entry = 
_mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key);
    u_rwlock_rdunlock(&device->vs_prologs_lock);
 
    if (!prolog_entry) {
       u_rwlock_wrlock(&device->vs_prologs_lock);
-      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, 
hash, key_words);
+      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, 
hash, &key);
       if (prolog_entry) {
          u_rwlock_wrunlock(&device->vs_prologs_lock);
          return prolog_entry->data;
       }
 
       prolog = radv_create_vs_prolog(device, &key);
-      uint32_t *key2 = malloc(key_size * 4);
+      struct radv_vs_prolog_key *key2 = malloc(sizeof(key));
       if (!prolog || !key2) {
          radv_shader_part_unref(device, prolog);
          free(key2);
          u_rwlock_wrunlock(&device->vs_prologs_lock);
          return NULL;
       }
-      memcpy(key2, key_words, key_size * 4);
+      memcpy(key2, &key, sizeof(key));
       _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, 
prolog);
 
       u_rwlock_wrunlock(&device->vs_prologs_lock);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index f613bf105e7..56a73c72b7c 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -168,25 +168,15 @@ radv_device_init_vs_prologs(struct radv_device *device)
    if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
       return VK_SUCCESS;
 
-   struct radv_vs_input_state state;
-   state.nontrivial_divisors = 0;
-   memset(state.offsets, 0, sizeof(state.offsets));
-   state.alpha_adjust_lo = 0;
-   state.alpha_adjust_hi = 0;
-   memset(state.formats, 0, sizeof(state.formats));
-
    struct radv_vs_prolog_key key;
-   key.state = &state;
-   key.misaligned_mask = 0;
+   memset(&key, 0, sizeof(key));
    key.as_ls = false;
    key.is_ngg = device->physical_device->use_ngg;
    key.next_stage = MESA_SHADER_VERTEX;
    key.wave32 = device->physical_device->ge_wave_size == 32;
 
    for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
-      state.attribute_mask = BITFIELD_MASK(i);
-      state.instance_rate_inputs = 0;
-
+      key.state.instance_rate_inputs = 0;
       key.num_attributes = i;
 
       device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
@@ -196,22 +186,16 @@ radv_device_init_vs_prologs(struct radv_device *device)
 
    unsigned idx = 0;
    for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
-      state.attribute_mask = BITFIELD_MASK(num_attributes);
-
-      for (unsigned i = 0; i < num_attributes; i++)
-         state.divisors[i] = 1;
-
       for (unsigned count = 1; count <= num_attributes; count++) {
          for (unsigned start = 0; start <= (num_attributes - count); start++) {
-            state.instance_rate_inputs = u_bit_consecutive(start, count);
-
+            key.state.instance_rate_inputs = u_bit_consecutive(start, count);
             key.num_attributes = num_attributes;
 
             struct radv_shader_part *prolog = radv_create_vs_prolog(device, 
&key);
             if (!prolog)
                return vk_error(device->physical_device->instance, 
VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
-            assert(idx == radv_instance_rate_prolog_index(num_attributes, 
state.instance_rate_inputs));
+            assert(idx == radv_instance_rate_prolog_index(num_attributes, 
key.state.instance_rate_inputs));
             device->instance_rate_vs_prologs[idx++] = prolog;
          }
       }
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3a0d4bf316a..fec40cf7065 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2581,7 +2581,7 @@ radv_create_vs_prolog(struct radv_device *device, const 
struct radv_vs_prolog_ke
    if (!prolog)
       goto fail;
 
-   prolog->nontrivial_divisors = key->state->nontrivial_divisors;
+   prolog->nontrivial_divisors = key->state.nontrivial_divisors;
 
    if (options.dump_shader) {
       fprintf(stderr, "Vertex prolog");
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index b2e8a8e7bea..b62a7147d6b 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -485,7 +485,21 @@ struct radv_vs_input_state {
 };
 
 struct radv_vs_prolog_key {
-   const struct radv_vs_input_state *state;
+   /* All the fields are pre-masked with BITFIELD_MASK(num_attributes).
+    * Some of the fields are pre-masked by other conditions. See 
lookup_vs_prolog.
+    */
+   struct {
+      uint32_t instance_rate_inputs;
+      uint32_t nontrivial_divisors;
+      uint32_t zero_divisors;
+      uint32_t post_shuffle;
+      /* Having two separate fields instead of a single uint64_t makes it 
easier to remove attributes
+       * using bitwise arithmetic.
+       */
+      uint32_t alpha_adjust_lo;
+      uint32_t alpha_adjust_hi;
+      uint8_t formats[MAX_VERTEX_ATTRIBS];
+   } state;
    unsigned num_attributes;
    uint32_t misaligned_mask;
    bool as_ls;

Mesa (main): radv, aco: Rework VS prolog key handling.

Reply via email to