Module: Mesa
Branch: main
Commit: ee29a8d1cdbdc553caac8ceeda03f5c737eda0f2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=ee29a8d1cdbdc553caac8ceeda03f5c737eda0f2

Author: Faith Ekstrand <faith.ekstr...@collabora.com>
Date:   Fri Dec 8 18:26:08 2023 -0600

nvk: Upload cbufs based on the cbuf_map

For draw, this requires that we use a macro to read descriptors out of
the descriptor buffer because we may not have that information easily
available on the CPU.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26615>

---
 src/nouveau/vulkan/nvk_cmd_buffer.c            |  56 +++++++++
 src/nouveau/vulkan/nvk_cmd_buffer.h            |  11 ++
 src/nouveau/vulkan/nvk_cmd_dispatch.c          |  47 +++++--
 src/nouveau/vulkan/nvk_cmd_draw.c              | 162 +++++++++++++++++++++++--
 src/nouveau/vulkan/nvk_graphics_pipeline.c     |   5 +-
 src/nouveau/vulkan/nvk_mme.c                   |   1 +
 src/nouveau/vulkan/nvk_mme.h                   |   2 +
 src/nouveau/vulkan/nvk_nir_lower_descriptors.c |   6 +
 8 files changed, 267 insertions(+), 23 deletions(-)

diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.c b/src/nouveau/vulkan/nvk_cmd_buffer.c
index 4c7b49a0865..53327917092 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@@ -705,6 +705,62 @@ nvk_cmd_buffer_flush_push_descriptors(struct nvk_cmd_buffer *cmd,
    }
 }
 
+bool
+nvk_cmd_buffer_get_cbuf_descriptor(struct nvk_cmd_buffer *cmd,
+                                   const struct nvk_descriptor_state *desc,
+                                   const struct nvk_cbuf *cbuf,
+                                   struct nvk_buffer_address *desc_out)
+{
+   switch (cbuf->type) {
+   case NVK_CBUF_TYPE_INVALID:
+      *desc_out = (struct nvk_buffer_address) { .size = 0 };
+      return true;
+
+   case NVK_CBUF_TYPE_ROOT_DESC:
+      unreachable("The caller should handle root descriptors");
+      return false;
+
+   case NVK_CBUF_TYPE_DESC_SET:
+      *desc_out = (struct nvk_buffer_address) {
+         .base_addr = desc->root.sets[cbuf->desc_set],
+         .size = desc->set_sizes[cbuf->desc_set],
+      };
+      return true;
+
+   case NVK_CBUF_TYPE_DYNAMIC_UBO:
+      *desc_out = desc->root.dynamic_buffers[cbuf->dynamic_idx];
+      return true;
+
+   case NVK_CBUF_TYPE_UBO_DESC: {
+      if (desc->sets[cbuf->desc_set] != NULL)
+         return false;
+
+      struct nvk_push_descriptor_set *push = desc->push[cbuf->desc_set];
+      if (push == NULL)
+         return false;
+
+      assert(cbuf->desc_offset < NVK_PUSH_DESCRIPTOR_SET_SIZE);
+      void *desc = &push->data[cbuf->desc_offset];
+      *desc_out = *(struct nvk_buffer_address *)desc;
+      return true;
+   }
+
+   default:
+      unreachable("Invalid cbuf type");
+   }
+}
+
+uint64_t
+nvk_cmd_buffer_get_cbuf_descriptor_addr(struct nvk_cmd_buffer *cmd,
+                                        const struct nvk_descriptor_state *desc,
+                                        const struct nvk_cbuf *cbuf)
+{
+   assert(cbuf->type == NVK_CBUF_TYPE_UBO_DESC);
+
+   assert(cbuf->desc_offset < desc->set_sizes[cbuf->desc_set]);
+   return desc->root.sets[cbuf->desc_set] + cbuf->desc_offset;
+}
+
 void
 nvk_cmd_buffer_dump(struct nvk_cmd_buffer *cmd, FILE *fp)
 {
diff --git a/src/nouveau/vulkan/nvk_cmd_buffer.h b/src/nouveau/vulkan/nvk_cmd_buffer.h
index 4e64d409463..ec6042e3c73 100644
--- a/src/nouveau/vulkan/nvk_cmd_buffer.h
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.h
@@ -18,6 +18,7 @@
 #include <stdio.h>
 
 struct nvk_buffer;
+struct nvk_cbuf;
 struct nvk_cmd_bo;
 struct nvk_cmd_pool;
 struct nvk_image_view;
@@ -245,6 +246,16 @@ void
 nvk_cmd_buffer_flush_push_descriptors(struct nvk_cmd_buffer *cmd,
                                       struct nvk_descriptor_state *desc);
 
+bool
+nvk_cmd_buffer_get_cbuf_descriptor(struct nvk_cmd_buffer *cmd,
+                                   const struct nvk_descriptor_state *desc,
+                                   const struct nvk_cbuf *cbuf,
+                                   struct nvk_buffer_address *desc_out);
+uint64_t
+nvk_cmd_buffer_get_cbuf_descriptor_addr(struct nvk_cmd_buffer *cmd,
+                                        const struct nvk_descriptor_state *desc,
+                                        const struct nvk_cbuf *cbuf);
+
 void
 nvk_meta_resolve_rendering(struct nvk_cmd_buffer *cmd,
                            const VkRenderingInfo *pRenderingInfo);
diff --git a/src/nouveau/vulkan/nvk_cmd_dispatch.c b/src/nouveau/vulkan/nvk_cmd_dispatch.c
index 9697c61babc..5708c815da6 100644
--- a/src/nouveau/vulkan/nvk_cmd_dispatch.c
+++ b/src/nouveau/vulkan/nvk_cmd_dispatch.c
@@ -159,6 +159,9 @@ static uint64_t
 nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                         uint64_t *root_desc_addr_out)
 {
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   struct nvk_physical_device *pdev = nvk_device_physical(dev);
+   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
    const struct nvk_compute_pipeline *pipeline = cmd->state.cs.pipeline;
    struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;
    VkResult result;
@@ -170,10 +173,12 @@ nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
     * 0x100 aligned.
     */
    STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
+   assert(sizeof(desc->root) % min_cbuf_alignment == 0);
 
    void *root_desc_map;
    uint64_t root_desc_addr;
-   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root), 0x100,
+   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
+                                        min_cbuf_alignment,
                                         &root_desc_addr, &root_desc_map);
    if (unlikely(result != VK_SUCCESS)) {
       vk_command_buffer_set_error(&cmd->vk, result);
@@ -192,26 +197,50 @@ nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                                    desc->root.cs.group_count[0],
                                    desc->root.cs.group_count[1],
                                    desc->root.cs.group_count[2]);
-
-      nvc6c0_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr);
-      nvc6c0_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr);
    } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
       nvc0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                    desc->root.cs.group_count[0],
                                    desc->root.cs.group_count[1],
                                    desc->root.cs.group_count[2]);
-
-      nvc0c0_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr);
-      nvc0c0_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr);
    } else {
       assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
       nva0c0_qmd_set_dispatch_size(nvk_cmd_buffer_device(cmd), qmd,
                                    desc->root.cs.group_count[0],
                                    desc->root.cs.group_count[1],
                                    desc->root.cs.group_count[2]);
+   }
+
+   const struct nvk_shader *shader =
+      &pipeline->base.shaders[MESA_SHADER_COMPUTE];
+   for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
+      const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];
+
+      struct nvk_buffer_address ba;
+      if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
+         ba = (struct nvk_buffer_address) {
+            .base_addr = root_desc_addr,
+            .size = sizeof(desc->root),
+         };
+      } else {
+         ASSERTED bool direct_descriptor =
+            nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, cbuf, &ba);
+         assert(direct_descriptor);
+      }
 
-      nva0c0_cp_launch_desc_set_cb(qmd, 0, sizeof(desc->root), root_desc_addr);
-      nva0c0_cp_launch_desc_set_cb(qmd, 1, sizeof(desc->root), root_desc_addr);
+      if (ba.size > 0) {
+         assert(ba.base_addr % min_cbuf_alignment == 0);
+         ba.size = align(ba.size, min_cbuf_alignment);
+         ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
+
+         if (nvk_cmd_buffer_compute_cls(cmd) >= AMPERE_COMPUTE_A) {
+            nvc6c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
+         } else if (nvk_cmd_buffer_compute_cls(cmd) >= PASCAL_COMPUTE_A) {
+            nvc0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
+         } else {
+            assert(nvk_cmd_buffer_compute_cls(cmd) >= KEPLER_COMPUTE_A);
+            nva0c0_cp_launch_desc_set_cb(qmd, c, ba.size, ba.base_addr);
+         }
+      }
    }
 
    uint64_t qmd_addr;
diff --git a/src/nouveau/vulkan/nvk_cmd_draw.c b/src/nouveau/vulkan/nvk_cmd_draw.c
index 7ef9c86e2ea..8284de36635 100644
--- a/src/nouveau/vulkan/nvk_cmd_draw.c
+++ b/src/nouveau/vulkan/nvk_cmd_draw.c
@@ -454,7 +454,7 @@ nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
                               const VkCommandBufferBeginInfo *pBeginInfo)
 {
    if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
-      struct nv_push *p = nvk_cmd_buffer_push(cmd, 3);
+      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
       P_MTHD(p, NV9097, INVALIDATE_SAMPLER_CACHE_NO_WFI);
       P_NV9097_INVALIDATE_SAMPLER_CACHE_NO_WFI(p, {
          .lines = LINES_ALL,
@@ -462,6 +462,10 @@ nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
       P_NV9097_INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI(p, {
          .lines = LINES_ALL,
       });
+
+      P_IMMD(p, NVA097, INVALIDATE_SHADER_CACHES_NO_WFI, {
+         .constant = CONSTANT_TRUE,
+      });
    }
 
    if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
@@ -1649,9 +1653,71 @@ nvk_flush_dynamic_state(struct nvk_cmd_buffer *cmd)
    vk_dynamic_graphics_state_clear_dirty(dyn);
 }
 
+void
+nvk_mme_bind_cbuf_desc(struct mme_builder *b)
+{
+   /* First 4 bits are group, later bits are slot */
+   struct mme_value group_slot = mme_load(b);
+
+   if (b->devinfo->cls_eng3d >= TURING_A) {
+      struct mme_value64 addr = mme_load_addr64(b);
+      mme_tu104_read_fifoed(b, addr, mme_imm(3));
+   }
+
+   /* Load the descriptor */
+   struct mme_value addr_lo = mme_load(b);
+   struct mme_value addr_hi = mme_load(b);
+   struct mme_value size = mme_load(b);
+
+   struct mme_value cb = mme_alloc_reg(b);
+   mme_if(b, ieq, size, mme_zero()) {
+      /* Bottom bit is the valid bit, 8:4 are shader slot */
+      mme_merge_to(b, cb, mme_zero(), group_slot, 4, 5, 4);
+   }
+
+   mme_if(b, ine, size, mme_zero()) {
+      uint32_t alignment = nvk_min_cbuf_alignment(b->devinfo);
+      mme_add_to(b, size, size, mme_imm(alignment - 1));
+      mme_and_to(b, size, size, mme_imm(~(alignment - 1)));
+
+      /* size = min(size, NVK_MAX_CBUF_SIZE) */
+      assert(util_is_power_of_two_nonzero(NVK_MAX_CBUF_SIZE));
+      struct mme_value is_large =
+         mme_and(b, size, mme_imm(~(NVK_MAX_CBUF_SIZE - 1)));
+      mme_if(b, ine, is_large, mme_zero()) {
+         mme_mov_to(b, size, mme_imm(NVK_MAX_CBUF_SIZE));
+      }
+
+      mme_mthd(b, NV9097_SET_CONSTANT_BUFFER_SELECTOR_A);
+      mme_emit(b, size);
+      mme_emit(b, addr_hi);
+      mme_emit(b, addr_lo);
+
+      /* Bottom bit is the valid bit, 8:4 are shader slot */
+      mme_merge_to(b, cb, mme_imm(1), group_slot, 4, 5, 4);
+   }
+
+   mme_free_reg(b, addr_hi);
+   mme_free_reg(b, addr_lo);
+   mme_free_reg(b, size);
+
+   /* The group comes in the bottom 4 bits in group_slot and we need to
+    * combine it with the method.  However, unlike most array methods with a
+    * stride of 1 dword, BIND_GROUP_CONSTANT_BUFFER has a stride of 32B or 8
+    * dwords.  This means we need to also shift by 3.
+    */
+   struct mme_value group = mme_merge(b, mme_imm(0), group_slot, 3, 4, 0);
+   mme_mthd_arr(b, NV9097_BIND_GROUP_CONSTANT_BUFFER(0), group);
+   mme_emit(b, cb);
+}
+
 static void
 nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
 {
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
+   struct nvk_physical_device *pdev = nvk_device_physical(dev);
+   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
+
    const struct nvk_graphics_pipeline *pipeline = cmd->state.gfx.pipeline;
    struct nvk_descriptor_state *desc = &cmd->state.gfx.descriptors;
    VkResult result;
@@ -1662,10 +1728,12 @@ nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
     * 0x100 aligned.
     */
    STATIC_ASSERT((sizeof(desc->root) & 0xff) == 0);
+   assert(sizeof(desc->root) % min_cbuf_alignment == 0);
 
    void *root_desc_map;
    uint64_t root_desc_addr;
-   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root), 0x100,
+   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(desc->root),
+                                        min_cbuf_alignment,
                                         &root_desc_addr, &root_desc_map);
    if (unlikely(result != VK_SUCCESS)) {
       vk_command_buffer_set_error(&cmd->vk, result);
@@ -1675,22 +1743,92 @@ nvk_flush_descriptors(struct nvk_cmd_buffer *cmd)
    desc->root.root_desc_addr = root_desc_addr;
    memcpy(root_desc_map, &desc->root, sizeof(desc->root));
 
-   struct nv_push *p = nvk_cmd_buffer_push(cmd, 24);
+   uint32_t root_cbuf_count = 0;
+   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      const struct nvk_shader *shader = &pipeline->base.shaders[stage];
+      if (shader->code_size == 0)
+         continue;
+
+      uint32_t group = stage;
+      for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
+         const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];
+
+         /* We bind these at the very end */
+         if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
+            root_cbuf_count++;
+            continue;
+         }
+
+         struct nvk_buffer_address ba;
+         if (nvk_cmd_buffer_get_cbuf_descriptor(cmd, desc, cbuf, &ba)) {
+            assert(ba.base_addr % min_cbuf_alignment == 0);
+            ba.size = align(ba.size, min_cbuf_alignment);
+            ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);
+
+            struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
+
+            if (ba.size > 0) {
+               P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
+               P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, ba.size);
+               P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, ba.base_addr >> 32);
+               P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, ba.base_addr);
+            }
+
+            P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
+               .valid = ba.size > 0,
+               .shader_slot = c,
+            });
+         } else {
+            uint64_t desc_addr =
+               nvk_cmd_buffer_get_cbuf_descriptor_addr(cmd, desc, cbuf);
+
+            if (nvk_cmd_buffer_3d_cls(cmd) >= TURING_A) {
+               struct nv_push *p = nvk_cmd_buffer_push(cmd, 4);
+
+               P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
+               P_INLINE_DATA(p, group | (c << 4));
+               P_INLINE_DATA(p, desc_addr >> 32);
+               P_INLINE_DATA(p, desc_addr);
+            } else {
+               struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
+
+               P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_CBUF_DESC));
+               P_INLINE_DATA(p, group | (c << 4));
+
+               nv_push_update_count(p, 3);
+               nvk_cmd_buffer_push_indirect(cmd, desc_addr, 3);
+            }
+         }
+      }
+   }
+
+   /* We bind all root descriptors last so that CONSTANT_BUFFER_SELECTOR is
+    * always left pointing at the root descriptor table.  This way draw
+    * parameters and similar MME root table updates always hit the root
+    * descriptor table and not some random UBO.
+    */
+   struct nv_push *p = nvk_cmd_buffer_push(cmd, 4 + 2 * root_cbuf_count);
 
    P_MTHD(p, NV9097, SET_CONSTANT_BUFFER_SELECTOR_A);
    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_A(p, sizeof(desc->root));
    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_B(p, root_desc_addr >> 32);
    P_NV9097_SET_CONSTANT_BUFFER_SELECTOR_C(p, root_desc_addr);
 
-   for (uint32_t i = 0; i < 5; i++) {
-      P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(i), {
-         .valid = VALID_TRUE,
-         .shader_slot = 0,
-      });
-      P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(i), {
-         .valid = VALID_TRUE,
-         .shader_slot = 1,
-      });
+   for (gl_shader_stage stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      const struct nvk_shader *shader = &pipeline->base.shaders[stage];
+      if (shader->code_size == 0)
+         continue;
+
+      uint32_t group = stage;
+
+      for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
+         if (shader->cbuf_map.cbufs[c].type == NVK_CBUF_TYPE_ROOT_DESC) {
+            P_IMMD(p, NV9097, BIND_GROUP_CONSTANT_BUFFER(group), {
+               .valid = VALID_TRUE,
+               .shader_slot = c,
+            });
+         }
+      }
    }
 }
 
diff --git a/src/nouveau/vulkan/nvk_graphics_pipeline.c b/src/nouveau/vulkan/nvk_graphics_pipeline.c
index 2d8d23e7866..c5077086500 100644
--- a/src/nouveau/vulkan/nvk_graphics_pipeline.c
+++ b/src/nouveau/vulkan/nvk_graphics_pipeline.c
@@ -401,8 +401,9 @@ nvk_graphics_pipeline_create(struct nvk_device *dev,
          P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
       }
 
-      P_IMMD(p, NV9097, SET_PIPELINE_REGISTER_COUNT(idx),
-             shader->info.num_gprs);
+      P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
+      P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
+      P_NVC397_SET_PIPELINE_BINDING(p, idx, stage);
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
diff --git a/src/nouveau/vulkan/nvk_mme.c b/src/nouveau/vulkan/nvk_mme.c
index ee3e9ccf908..7f57b634d58 100644
--- a/src/nouveau/vulkan/nvk_mme.c
+++ b/src/nouveau/vulkan/nvk_mme.c
@@ -7,6 +7,7 @@
 #include "nvk_private.h"
 
 static const nvk_mme_builder_func mme_builders[NVK_MME_COUNT] = {
+   [NVK_MME_BIND_CBUF_DESC] = nvk_mme_bind_cbuf_desc,
    [NVK_MME_CLEAR] = nvk_mme_clear,
    [NVK_MME_DRAW] = nvk_mme_draw,
    [NVK_MME_DRAW_INDEXED] = nvk_mme_draw_indexed,
diff --git a/src/nouveau/vulkan/nvk_mme.h b/src/nouveau/vulkan/nvk_mme.h
index d73e6925e29..0a635f8c37b 100644
--- a/src/nouveau/vulkan/nvk_mme.h
+++ b/src/nouveau/vulkan/nvk_mme.h
@@ -10,6 +10,7 @@
 struct nv_device_info;
 
 enum nvk_mme {
+   NVK_MME_BIND_CBUF_DESC,
    NVK_MME_CLEAR,
    NVK_MME_DRAW,
    NVK_MME_DRAW_INDEXED,
@@ -111,6 +112,7 @@ uint32_t *nvk_build_mme(const struct nv_device_info *devinfo,
 void nvk_test_build_all_mmes(const struct nv_device_info *devinfo);
 
+void nvk_mme_bind_cbuf_desc(struct mme_builder *b);
 void nvk_mme_clear(struct mme_builder *b);
 void nvk_mme_draw(struct mme_builder *b);
 void nvk_mme_draw_indexed(struct mme_builder *b);
diff --git a/src/nouveau/vulkan/nvk_nir_lower_descriptors.c b/src/nouveau/vulkan/nvk_nir_lower_descriptors.c
index e08a823e3d2..b2ebe316ded 100644
--- a/src/nouveau/vulkan/nvk_nir_lower_descriptors.c
+++ b/src/nouveau/vulkan/nvk_nir_lower_descriptors.c
@@ -435,6 +435,12 @@ build_cbuf_map(nir_shader *nir, struct lower_descriptors_ctx *ctx)
       if (mapped_cbuf_count >= max_cbuf_bindings)
          break;
 
+      /* We can't support indirect cbufs in compute yet */
+      if ((nir->info.stage == MESA_SHADER_COMPUTE ||
+           nir->info.stage == MESA_SHADER_KERNEL) &&
+          cbufs[i].key.type == NVK_CBUF_TYPE_UBO_DESC)
+         continue;
+
       ctx->cbuf_map->cbufs[mapped_cbuf_count++] = cbufs[i].key;
    }
 
    ctx->cbuf_map->cbuf_count = mapped_cbuf_count;
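
An illustrative aside, not part of the commit: the MME macro and the draw-time
loop in nvk_flush_descriptors() communicate through a single packed dword,
which the macro then re-packs for BIND_GROUP_CONSTANT_BUFFER.  The plain-C
sketch below models that packing, assuming the bit layout stated in the
macro's own comments (group in bits 3:0 of the pushed word, shader slot above
it, valid in bit 0 of the data word, slot in bits 8:4, 8-dword method stride
for the bind-group array).  The helper names are made up for the sketch.

   #include <stdint.h>
   #include <stdbool.h>

   /* What nvk_flush_descriptors() pushes to NVK_MME_BIND_CBUF_DESC:
    * group index in bits 3:0, cbuf slot in the bits above.
    */
   static inline uint32_t pack_group_slot(uint32_t group, uint32_t slot)
   {
      return group | (slot << 4);
   }

   /* Data word for BIND_GROUP_CONSTANT_BUFFER as the macro assembles it
    * with mme_merge_to(): bit 0 = valid, bits 8:4 = shader slot.
    */
   static inline uint32_t bind_group_cb_data(uint32_t group_slot, uint32_t size)
   {
      uint32_t slot = (group_slot >> 4) & 0x1f;
      bool valid = size > 0;
      return (uint32_t)valid | (slot << 4);
   }

   /* Method-array offset: BIND_GROUP_CONSTANT_BUFFER(i) entries are 32B
    * (8 dwords) apart, so the group index is shifted left by 3.
    */
   static inline uint32_t bind_group_cb_offset(uint32_t group_slot)
   {
      return (group_slot & 0xf) << 3;
   }

On Turing and later the macro fetches the three descriptor dwords itself via
mme_tu104_read_fifoed(); on older hardware nvk_flush_descriptors() streams
them in with nvk_cmd_buffer_push_indirect(), as the nvk_cmd_draw.c hunk shows.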