Module: Mesa Branch: main Commit: 3fc3a94bce6c542cc5c23eca832b6a5c45ac5c79 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=3fc3a94bce6c542cc5c23eca832b6a5c45ac5c79
Author: Tatsuyuki Ishi <ishitatsuy...@gmail.com> Date: Fri Nov 3 19:07:51 2023 +0900 radv, aco: Rework VS prolog key handling. The main change is to use struct radv_vs_prolog_key directly instead of the compressed representation to simplify an upcoming rework in prolog / epilog caching. In doing so the state struct pointer was replaced with an inline struct. Care was also taken to pre-mask all the states with the active attribute mask and other masks when it makes sense; this ensures that we don't accidentally use information not hashed into the key during compilation. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26023> --- src/amd/compiler/aco_instruction_selection.cpp | 7 +- src/amd/vulkan/radv_aco_shader_info.h | 4 +- src/amd/vulkan/radv_cmd_buffer.c | 112 +++++-------------------- src/amd/vulkan/radv_device.c | 24 +----- src/amd/vulkan/radv_shader.c | 2 +- src/amd/vulkan/radv_shader.h | 16 +++- 6 files changed, 45 insertions(+), 120 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 36bcf872479..336fd9260f5 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -12739,7 +12739,7 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh bld.sopp(aco_opcode::s_setprio, -1u, 0x3u); uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes); - bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors & attrib_mask; + bool has_nontrivial_divisors = pinfo->state.nontrivial_divisors; wait_imm lgkm_imm; lgkm_imm.lgkm = 0; @@ -12800,10 +12800,9 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh } bool needs_instance_index = - pinfo->state.instance_rate_inputs & attrib_mask & + pinfo->state.instance_rate_inputs & ~(pinfo->state.zero_divisors | pinfo->state.nontrivial_divisors); /* divisor is 1 */ - bool needs_start_instance = - pinfo->state.instance_rate_inputs & attrib_mask & pinfo->state.zero_divisors; + bool needs_start_instance = pinfo->state.instance_rate_inputs & pinfo->state.zero_divisors; bool needs_vertex_index = ~pinfo->state.instance_rate_inputs & attrib_mask; if (needs_vertex_index) bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex), diff --git a/src/amd/vulkan/radv_aco_shader_info.h b/src/amd/vulkan/radv_aco_shader_info.h index b8887582525..e75ba5826aa 100644 --- a/src/amd/vulkan/radv_aco_shader_info.h +++ b/src/amd/vulkan/radv_aco_shader_info.h @@ -72,8 +72,8 @@ radv_aco_convert_shader_info(struct aco_shader_info *aco_info, const struct radv aco_info->next_stage_pc = radv_args->next_stage_pc; } -#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state->x -#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state->x, sizeof(radv->state->x)) +#define ASSIGN_VS_STATE_FIELD(x) aco_info->state.x = radv->state.x +#define ASSIGN_VS_STATE_FIELD_CP(x) memcpy(&aco_info->state.x, &radv->state.x, sizeof(radv->state.x)) static inline void radv_aco_convert_vs_prolog_key(struct aco_vs_prolog_info *aco_info, const struct radv_vs_prolog_key *radv, const struct radv_shader_args *radv_args) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 0155bc2e2a2..19e0e55e45b 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -3691,53 +3691,25 @@ radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_ return start_index + offset_from_start_index + first; } -union vs_prolog_key_header { - struct { - uint32_t key_size : 8; - uint32_t num_attributes : 6; - uint32_t as_ls : 1; - uint32_t is_ngg : 1; - uint32_t wave32 : 1; - uint32_t next_stage : 3; - uint32_t instance_rate_inputs : 1; - uint32_t alpha_adjust_lo : 1; - uint32_t alpha_adjust_hi : 1; - uint32_t misaligned_mask : 1; - uint32_t post_shuffle : 1; - uint32_t nontrivial_divisors : 1; - uint32_t zero_divisors : 1; - /* We need this to ensure the padding is zero. It's useful even if it's unused. */ - uint32_t padding0 : 5; - }; - uint32_t v; -}; - uint32_t radv_hash_vs_prolog(const void *key_) { - const uint32_t *key = key_; - union vs_prolog_key_header header; - header.v = key[0]; - return _mesa_hash_data(key, header.key_size); + const struct radv_vs_prolog_key *key = key_; + return _mesa_hash_data(key, sizeof(*key)); } bool radv_cmp_vs_prolog(const void *a_, const void *b_) { - const uint32_t *a = a_; - const uint32_t *b = b_; - if (a[0] != b[0]) - return false; + const struct radv_vs_prolog_key *a = a_; + const struct radv_vs_prolog_key *b = b_; - union vs_prolog_key_header header; - header.v = a[0]; - return memcmp(a, b, header.key_size) == 0; + return memcmp(a, b, sizeof(*a)) == 0; } static struct radv_shader_part * lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors) { - STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4); assert(vs_shader->info.vs.dynamic_inputs); const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input; @@ -3800,12 +3772,17 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v if (prolog) return prolog; - /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */ - uint32_t key_words[17]; - unsigned key_size = 1; - struct radv_vs_prolog_key key; - key.state = state; + memset(&key, 0, sizeof(key)); + key.state.instance_rate_inputs = instance_rate_inputs; + key.state.nontrivial_divisors = *nontrivial_divisors; + key.state.zero_divisors = zero_divisors; + /* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */ + key.state.post_shuffle = state->post_shuffle & attribute_mask & misaligned_mask; + key.state.alpha_adjust_hi = state->alpha_adjust_hi & attribute_mask; + key.state.alpha_adjust_lo = state->alpha_adjust_lo & attribute_mask; + u_foreach_bit (index, misaligned_mask) + key.state.formats[index] = state->formats[index]; key.num_attributes = num_attributes; key.misaligned_mask = misaligned_mask; /* The instance ID input VGPR is placed differently when as_ls=true. */ @@ -3820,78 +3797,29 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v key.next_stage = vs_shader->info.stage; } - union vs_prolog_key_header header; - header.v = 0; - header.num_attributes = num_attributes; - header.as_ls = key.as_ls; - header.is_ngg = key.is_ngg; - header.wave32 = key.wave32; - header.next_stage = key.next_stage; - - if (instance_rate_inputs & ~*nontrivial_divisors) { - header.instance_rate_inputs = true; - key_words[key_size++] = instance_rate_inputs; - } - if (*nontrivial_divisors) { - header.nontrivial_divisors = true; - key_words[key_size++] = *nontrivial_divisors; - } - if (zero_divisors) { - header.zero_divisors = true; - key_words[key_size++] = zero_divisors; - } - if (misaligned_mask) { - header.misaligned_mask = true; - key_words[key_size++] = misaligned_mask; - - uint8_t *formats = (uint8_t *)&key_words[key_size]; - unsigned num_formats = 0; - u_foreach_bit (index, misaligned_mask) - formats[num_formats++] = state->formats[index]; - while (num_formats & 0x3) - formats[num_formats++] = 0; - key_size += num_formats / 4u; - - if (state->post_shuffle & attribute_mask) { - header.post_shuffle = true; - key_words[key_size++] = state->post_shuffle & attribute_mask; - } - } - if (state->alpha_adjust_lo & attribute_mask) { - header.alpha_adjust_lo = true; - key_words[key_size++] = state->alpha_adjust_lo & attribute_mask; - } - if (state->alpha_adjust_hi & attribute_mask) { - header.alpha_adjust_hi = true; - key_words[key_size++] = state->alpha_adjust_hi & attribute_mask; - } - - header.key_size = key_size * sizeof(key_words[0]); - key_words[0] = header.v; - - uint32_t hash = radv_hash_vs_prolog(key_words); + uint32_t hash = radv_hash_vs_prolog(&key); u_rwlock_rdlock(&device->vs_prologs_lock); - struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); + struct hash_entry *prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key); u_rwlock_rdunlock(&device->vs_prologs_lock); if (!prolog_entry) { u_rwlock_wrlock(&device->vs_prologs_lock); - prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words); + prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, &key); if (prolog_entry) { u_rwlock_wrunlock(&device->vs_prologs_lock); return prolog_entry->data; } prolog = radv_create_vs_prolog(device, &key); - uint32_t *key2 = malloc(key_size * 4); + struct radv_vs_prolog_key *key2 = malloc(sizeof(key)); if (!prolog || !key2) { radv_shader_part_unref(device, prolog); free(key2); u_rwlock_wrunlock(&device->vs_prologs_lock); return NULL; } - memcpy(key2, key_words, key_size * 4); + memcpy(key2, &key, sizeof(key)); _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog); u_rwlock_wrunlock(&device->vs_prologs_lock); diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index f613bf105e7..56a73c72b7c 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -168,25 +168,15 @@ radv_device_init_vs_prologs(struct radv_device *device) if (device->instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS) return VK_SUCCESS; - struct radv_vs_input_state state; - state.nontrivial_divisors = 0; - memset(state.offsets, 0, sizeof(state.offsets)); - state.alpha_adjust_lo = 0; - state.alpha_adjust_hi = 0; - memset(state.formats, 0, sizeof(state.formats)); - struct radv_vs_prolog_key key; - key.state = &state; - key.misaligned_mask = 0; + memset(&key, 0, sizeof(key)); key.as_ls = false; key.is_ngg = device->physical_device->use_ngg; key.next_stage = MESA_SHADER_VERTEX; key.wave32 = device->physical_device->ge_wave_size == 32; for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) { - state.attribute_mask = BITFIELD_MASK(i); - state.instance_rate_inputs = 0; - + key.state.instance_rate_inputs = 0; key.num_attributes = i; device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key); @@ -196,22 +186,16 @@ radv_device_init_vs_prologs(struct radv_device *device) unsigned idx = 0; for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) { - state.attribute_mask = BITFIELD_MASK(num_attributes); - - for (unsigned i = 0; i < num_attributes; i++) - state.divisors[i] = 1; - for (unsigned count = 1; count <= num_attributes; count++) { for (unsigned start = 0; start <= (num_attributes - count); start++) { - state.instance_rate_inputs = u_bit_consecutive(start, count); - + key.state.instance_rate_inputs = u_bit_consecutive(start, count); key.num_attributes = num_attributes; struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key); if (!prolog) return vk_error(device->physical_device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY); - assert(idx == radv_instance_rate_prolog_index(num_attributes, state.instance_rate_inputs)); + assert(idx == radv_instance_rate_prolog_index(num_attributes, key.state.instance_rate_inputs)); device->instance_rate_vs_prologs[idx++] = prolog; } } diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 3a0d4bf316a..fec40cf7065 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2581,7 +2581,7 @@ radv_create_vs_prolog(struct radv_device *device, const struct radv_vs_prolog_ke if (!prolog) goto fail; - prolog->nontrivial_divisors = key->state->nontrivial_divisors; + prolog->nontrivial_divisors = key->state.nontrivial_divisors; if (options.dump_shader) { fprintf(stderr, "Vertex prolog"); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index b2e8a8e7bea..b62a7147d6b 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -485,7 +485,21 @@ struct radv_vs_input_state { }; struct radv_vs_prolog_key { - const struct radv_vs_input_state *state; + /* All the fields are pre-masked with BITFIELD_MASK(num_attributes). + * Some of the fields are pre-masked by other conditions. See lookup_vs_prolog. + */ + struct { + uint32_t instance_rate_inputs; + uint32_t nontrivial_divisors; + uint32_t zero_divisors; + uint32_t post_shuffle; + /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes + * using bitwise arithmetic. + */ + uint32_t alpha_adjust_lo; + uint32_t alpha_adjust_hi; + uint8_t formats[MAX_VERTEX_ATTRIBS]; + } state; unsigned num_attributes; uint32_t misaligned_mask; bool as_ls;