Module: Mesa
Branch: main
Commit: 1ba1ee9e7cbd57684f39cc0b841969d00688518f
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=1ba1ee9e7cbd57684f39cc0b841969d00688518f

Author: Mike Blumenkrantz <[email protected]>
Date:   Fri Mar 18 09:56:31 2022 -0400

lavapipe: implement EXT_inline_uniform_block

This adds a fair amount of machinery to propagate the inline uniform
block sizes down from the descriptor set layout to the pipeline layout
to the rendering_state.

Block data is appended to ubo0 immediately following the push constant
data (if any), which requires that a new buffer be created and
filled any time either type of data changes.

Shader handling is done by propagating the offset of each block relative
to the start of its descriptor set, then accumulating the sizes of
every uniform block in each preceding descriptor set into that offset,
then adding the push constant size, and finally adding the result to
the existing load_ubo deref offset.

Update-after-bind is no longer an issue, since each instance of
push-constant + block data is its own immutable buffer that can never
be modified.

Reviewed-by: Dave Airlie <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15457>

---

 .../frontends/lavapipe/lvp_descriptor_set.c        | 72 ++++++++++++++++++----
 src/gallium/frontends/lavapipe/lvp_execute.c       | 43 ++++++++++++-
 .../frontends/lavapipe/lvp_lower_vulkan_resource.c | 47 ++++++++++++++
 src/gallium/frontends/lavapipe/lvp_private.h       | 11 ++++
 4 files changed, 159 insertions(+), 14 deletions(-)

diff --git a/src/gallium/frontends/lavapipe/lvp_descriptor_set.c 
b/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
index 175a9262bf0..f8b4f86ec2c 100644
--- a/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
+++ b/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
@@ -98,7 +98,10 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateDescriptorSetLayout(
       set_layout->binding[b].descriptor_index = set_layout->size;
       set_layout->binding[b].type = binding->descriptorType;
       set_layout->binding[b].valid = true;
-      set_layout->size += binding->descriptorCount;
+      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+         set_layout->size++;
+      else
+         set_layout->size += binding->descriptorCount;
 
       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < 
MESA_SHADER_STAGES; stage++) {
          set_layout->binding[b].stage[stage].const_buffer_index = -1;
@@ -106,6 +109,7 @@ VKAPI_ATTR VkResult VKAPI_CALL 
lvp_CreateDescriptorSetLayout(
          set_layout->binding[b].stage[stage].sampler_index = -1;
          set_layout->binding[b].stage[stage].sampler_view_index = -1;
          set_layout->binding[b].stage[stage].image_index = -1;
+         set_layout->binding[b].stage[stage].uniform_block_index = -1;
       }
 
       if (binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC 
||
@@ -141,6 +145,14 @@ VKAPI_ATTR VkResult VKAPI_CALL 
lvp_CreateDescriptorSetLayout(
             set_layout->stage[s].const_buffer_count += 
binding->descriptorCount;
          }
         break;
+      case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
+         lvp_foreach_stage(s, binding->stageFlags) {
+            set_layout->binding[b].stage[s].uniform_block_offset = 
set_layout->stage[s].uniform_block_size;
+            set_layout->binding[b].stage[s].uniform_block_index = 
set_layout->stage[s].uniform_block_count;
+            set_layout->stage[s].uniform_block_size += 
binding->descriptorCount;
+            
set_layout->stage[s].uniform_block_sizes[set_layout->stage[s].uniform_block_count++]
 = binding->descriptorCount;
+         }
+        break;
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
          lvp_foreach_stage(s, binding->stageFlags) {
@@ -260,6 +272,14 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreatePipelineLayout(
       LVP_FROM_HANDLE(lvp_descriptor_set_layout, set_layout,
                       pCreateInfo->pSetLayouts[set]);
       layout->set[set].layout = set_layout;
+      for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+         layout->stage[i].uniform_block_size += 
set_layout->stage[i].uniform_block_size;
+         for (unsigned j = 0; j < set_layout->stage[i].uniform_block_count; 
j++) {
+            assert(layout->stage[i].uniform_block_count + j < 
MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS * MAX_SETS);
+            
layout->stage[i].uniform_block_sizes[layout->stage[i].uniform_block_count + j] 
= set_layout->stage[i].uniform_block_sizes[j];
+         }
+         layout->stage[i].uniform_block_count += 
set_layout->stage[i].uniform_block_count;
+      }
       lvp_descriptor_set_layout_ref(set_layout);
    }
 
@@ -341,8 +361,10 @@ lvp_descriptor_set_create(struct lvp_device *device,
                           struct lvp_descriptor_set **out_set)
 {
    struct lvp_descriptor_set *set;
-   size_t size = sizeof(*set) + layout->size * sizeof(set->descriptors[0]);
-
+   size_t base_size = sizeof(*set) + layout->size * 
sizeof(set->descriptors[0]);
+   size_t size = base_size;
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++)
+      size += layout->stage[i].uniform_block_size;
    set = vk_alloc(&device->vk.alloc /* XXX: Use the pool */, size, 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (!set)
@@ -360,12 +382,19 @@ lvp_descriptor_set_create(struct lvp_device *device,
 
    /* Go through and fill out immutable samplers if we have any */
    struct lvp_descriptor *desc = set->descriptors;
+   uint8_t *uniform_mem = (uint8_t*)(set) + base_size;
    for (uint32_t b = 0; b < layout->binding_count; b++) {
-      if (layout->binding[b].immutable_samplers) {
-         for (uint32_t i = 0; i < layout->binding[b].array_size; i++)
-            desc[i].info.sampler = layout->binding[b].immutable_samplers[i];
+      if (layout->binding[b].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+         desc->info.uniform = uniform_mem;
+         uniform_mem += layout->binding[b].array_size;
+         desc++;
+      } else {
+         if (layout->binding[b].immutable_samplers) {
+            for (uint32_t i = 0; i < layout->binding[b].array_size; i++)
+               desc[i].info.sampler = layout->binding[b].immutable_samplers[i];
+         }
+         desc += layout->binding[b].array_size;
       }
-      desc += layout->binding[b].array_size;
    }
 
    *out_set = set;
@@ -444,6 +473,14 @@ VKAPI_ATTR void VKAPI_CALL lvp_UpdateDescriptorSets(
          &set->layout->binding[write->dstBinding];
       struct lvp_descriptor *desc =
          &set->descriptors[bind_layout->descriptor_index];
+      if (write->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+         const VkWriteDescriptorSetInlineUniformBlock *uniform_data =
+            vk_find_struct_const(write->pNext, 
WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
+         assert(uniform_data);
+         desc->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
+         memcpy(desc->info.uniform + write->dstArrayElement, 
uniform_data->pData, uniform_data->dataSize);
+         continue;
+      }
       desc += write->dstArrayElement;
 
       switch (write->descriptorType) {
@@ -540,16 +577,24 @@ VKAPI_ATTR void VKAPI_CALL lvp_UpdateDescriptorSets(
          &src->layout->binding[copy->srcBinding];
       struct lvp_descriptor *src_desc =
          &src->descriptors[src_layout->descriptor_index];
-      src_desc += copy->srcArrayElement;
 
       const struct lvp_descriptor_set_binding_layout *dst_layout =
          &dst->layout->binding[copy->dstBinding];
       struct lvp_descriptor *dst_desc =
          &dst->descriptors[dst_layout->descriptor_index];
-      dst_desc += copy->dstArrayElement;
 
-      for (uint32_t j = 0; j < copy->descriptorCount; j++)
-         dst_desc[j] = src_desc[j];
+      if (src_desc->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+         dst_desc->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
+         memcpy(dst_desc->info.uniform + copy->dstArrayElement,
+                src_desc->info.uniform + copy->srcArrayElement,
+                copy->descriptorCount);
+      } else {
+         src_desc += copy->srcArrayElement;
+         dst_desc += copy->dstArrayElement;
+
+         for (uint32_t j = 0; j < copy->descriptorCount; j++)
+            dst_desc[j] = src_desc[j];
+      }
    }
 }
 
@@ -689,6 +734,11 @@ VKAPI_ATTR void VKAPI_CALL 
lvp_UpdateDescriptorSetWithTemplate(VkDevice _device,
          &set->layout->binding[entry->dstBinding];
       struct lvp_descriptor *desc =
          &set->descriptors[bind_layout->descriptor_index];
+      if (entry->descriptorType == 
VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+         desc->type = VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
+         memcpy(desc->info.uniform + entry->dstArrayElement, pSrc, 
entry->descriptorCount);
+         continue;
+      }
       for (j = 0; j < entry->descriptorCount; ++j) {
          unsigned idx = j + entry->dstArrayElement;
          switch (entry->descriptorType) {
diff --git a/src/gallium/frontends/lavapipe/lvp_execute.c 
b/src/gallium/frontends/lavapipe/lvp_execute.c
index d17e24a04e2..6d1aef7b296 100644
--- a/src/gallium/frontends/lavapipe/lvp_execute.c
+++ b/src/gallium/frontends/lavapipe/lvp_execute.c
@@ -146,6 +146,11 @@ struct rendering_state {
 
    uint8_t push_constants[128 * 4];
    uint16_t push_size[2]; //gfx, compute
+   struct {
+      void *block[MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS * MAX_SETS];
+      uint16_t size[MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS * MAX_SETS];
+      uint16_t count;
+   } uniform_blocks[PIPE_SHADER_TYPES];
 
    const struct lvp_render_pass *pass;
    struct lvp_subpass *subpass;
@@ -208,6 +213,8 @@ static unsigned
 calc_ubo0_size(struct rendering_state *state, enum pipe_shader_type pstage)
 {
    unsigned size = get_pcbuf_size(state, pstage);
+   for (unsigned i = 0; i < state->uniform_blocks[pstage].count; i++)
+      size += state->uniform_blocks[pstage].size[i];
    return size;
 }
 
@@ -217,6 +224,13 @@ fill_ubo0(struct rendering_state *state, uint8_t *mem, 
enum pipe_shader_type pst
    unsigned push_size = get_pcbuf_size(state, pstage);
    if (push_size)
       memcpy(mem, state->push_constants, push_size);
+
+   mem += push_size;
+   for (unsigned i = 0; i < state->uniform_blocks[pstage].count; i++) {
+      unsigned size = state->uniform_blocks[pstage].size[i];
+      memcpy(mem, state->uniform_blocks[pstage].block[i], size);
+      mem += size;
+   }
 }
 
 static void
@@ -418,7 +432,10 @@ static void handle_compute_pipeline(struct 
vk_cmd_queue_entry *cmd,
 
    if ((pipeline->layout->push_constant_stages & VK_SHADER_STAGE_COMPUTE_BIT) 
> 0)
       state->has_pcbuf[PIPE_SHADER_COMPUTE] = 
pipeline->layout->push_constant_size > 0;
-   if (!state->has_pcbuf[PIPE_SHADER_COMPUTE])
+   state->uniform_blocks[PIPE_SHADER_COMPUTE].count = 
pipeline->layout->stage[MESA_SHADER_COMPUTE].uniform_block_count;
+   for (unsigned j = 0; j < 
pipeline->layout->stage[MESA_SHADER_COMPUTE].uniform_block_count; j++)
+      state->uniform_blocks[PIPE_SHADER_COMPUTE].size[j] = 
pipeline->layout->stage[MESA_SHADER_COMPUTE].uniform_block_sizes[j];
+   if (!state->has_pcbuf[PIPE_SHADER_COMPUTE] && 
!pipeline->layout->stage[MESA_SHADER_COMPUTE].uniform_block_count)
       state->pcbuf_dirty[PIPE_SHADER_COMPUTE] = false;
 
    state->dispatch_info.block[0] = 
pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0];
@@ -547,10 +564,16 @@ static void handle_graphics_pipeline(struct 
vk_cmd_queue_entry *cmd,
    for (enum pipe_shader_type sh = PIPE_SHADER_VERTEX; sh < 
PIPE_SHADER_COMPUTE; sh++)
       state->has_pcbuf[sh] = false;
 
+   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
+      enum pipe_shader_type sh = pipe_shader_type_from_mesa(i);
+      state->uniform_blocks[sh].count = 
pipeline->layout->stage[i].uniform_block_count;
+      for (unsigned j = 0; j < pipeline->layout->stage[i].uniform_block_count; 
j++)
+         state->uniform_blocks[sh].size[j] = 
pipeline->layout->stage[i].uniform_block_sizes[j];
+   }
    u_foreach_bit(stage, pipeline->layout->push_constant_stages) {
       enum pipe_shader_type sh = pipe_shader_type_from_mesa(stage);
       state->has_pcbuf[sh] = pipeline->layout->push_constant_size > 0;
-      if (!state->has_pcbuf[sh])
+      if (!state->has_pcbuf[sh] && !state->uniform_blocks[sh].count)
          state->pcbuf_dirty[sh] = false;
    }
 
@@ -992,6 +1015,7 @@ struct dyn_info {
       uint16_t sampler_count;
       uint16_t sampler_view_count;
       uint16_t image_count;
+      uint16_t uniform_block_count;
    } stage[MESA_SHADER_STAGES];
 
    uint32_t dyn_index;
@@ -1230,6 +1254,16 @@ static void handle_descriptor(struct rendering_state 
*state,
       type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC;
 
    switch (type) {
+   case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+      int idx = binding->stage[stage].uniform_block_index;
+      if (idx == -1)
+         return;
+      idx += dyn_info->stage[stage].uniform_block_count;
+      assert(descriptor->uniform);
+      state->uniform_blocks[p_stage].block[idx] = descriptor->uniform;
+      state->pcbuf_dirty[p_stage] = true;
+      break;
+   }
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
       fill_image_view_stage(state, dyn_info, stage, p_stage, array_idx, 
descriptor, binding);
@@ -1299,6 +1333,7 @@ static void handle_descriptor(struct rendering_state 
*state,
       break;
    default:
       fprintf(stderr, "Unhandled descriptor set %d\n", type);
+      unreachable("oops");
       break;
    }
 }
@@ -1316,7 +1351,8 @@ static void handle_set_stage(struct rendering_state 
*state,
       binding = &set->layout->binding[j];
 
       if (binding->valid) {
-         for (int i = 0; i < binding->array_size; i++) {
+         unsigned array_size = binding->type == 
VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ? 1 : binding->array_size;
+         for (int i = 0; i < array_size; i++) {
             descriptor = &set->descriptors[binding->descriptor_index + i];
             handle_descriptor(state, dyn_info, binding, stage, p_stage, i, 
descriptor->type, &descriptor->info);
          }
@@ -1333,6 +1369,7 @@ static void increment_dyn_info(struct dyn_info *dyn_info,
       dyn_info->stage[stage].sampler_count += 
layout->stage[stage].sampler_count;
       dyn_info->stage[stage].sampler_view_count += 
layout->stage[stage].sampler_view_count;
       dyn_info->stage[stage].image_count += layout->stage[stage].image_count;
+      dyn_info->stage[stage].uniform_block_count += 
layout->stage[stage].uniform_block_count;
    }
    if (inc_dyn)
       dyn_info->dyn_index += layout->dynamic_offset_count;
diff --git a/src/gallium/frontends/lavapipe/lvp_lower_vulkan_resource.c 
b/src/gallium/frontends/lavapipe/lvp_lower_vulkan_resource.c
index 42938693ca4..8edf36293aa 100644
--- a/src/gallium/frontends/lavapipe/lvp_lower_vulkan_resource.c
+++ b/src/gallium/frontends/lavapipe/lvp_lower_vulkan_resource.c
@@ -47,6 +47,48 @@ lower_vulkan_resource_index(const nir_instr *instr, const 
void *data_cb)
    return false;
 }
 
+static bool
+lower_uniform_block_access(const nir_instr *instr, const void *data_cb)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   if (intrin->intrinsic != nir_intrinsic_load_deref)
+      return false;
+   nir_deref_instr *deref = 
nir_instr_as_deref(intrin->src[0].ssa->parent_instr);
+   return deref->modes == nir_var_mem_ubo;
+}
+
+static nir_ssa_def *
+lower_block_instr(nir_builder *b, nir_instr *instr, void *data_cb)
+{
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+   nir_binding nb = nir_chase_binding(intrin->src[0]);
+   struct lvp_pipeline_layout *layout = data_cb;
+   struct lvp_descriptor_set_binding_layout *binding = 
&layout->set[nb.desc_set].layout->binding[nb.binding];
+   if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+      return NULL;
+   if (!binding->array_size)
+      return NIR_LOWER_INSTR_PROGRESS_REPLACE;
+
+   assert(intrin->src[0].ssa->num_components == 2);
+   unsigned value = 0;
+   for (unsigned s = 0; s < nb.desc_set; s++)
+      value += 
layout->set[s].layout->stage[b->shader->info.stage].uniform_block_size;
+   if (layout->push_constant_stages & BITFIELD_BIT(b->shader->info.stage))
+      value += layout->push_constant_size;
+   value += binding->stage[b->shader->info.stage].uniform_block_offset;
+
+   b->cursor = nir_before_instr(instr);
+   nir_ssa_def *offset = nir_imm_ivec2(b, 0, value);
+   nir_ssa_def *added = nir_iadd(b, intrin->src[0].ssa, offset);
+   nir_deref_instr *deref = 
nir_instr_as_deref(intrin->src[0].ssa->parent_instr);
+   nir_deref_instr *cast = nir_build_deref_cast(b, added, deref->modes, 
deref->type, 0);
+   nir_instr_rewrite_src_ssa(instr, &intrin->src[0], &cast->dest.ssa);
+   return NIR_LOWER_INSTR_PROGRESS;
+}
+
 static nir_ssa_def *lower_vri_intrin_vri(struct nir_builder *b,
                                            nir_instr *instr, void *data_cb)
 {
@@ -59,6 +101,10 @@ static nir_ssa_def *lower_vri_intrin_vri(struct nir_builder 
*b,
    bool is_ubo = (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
                   binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
 
+   /* always load inline uniform blocks from ubo0 */
+   if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+      return nir_imm_ivec2(b, 0, 0);
+
    for (unsigned s = 0; s < desc_set_idx; s++) {
      if (is_ubo)
        value += 
layout->set[s].layout->stage[b->shader->info.stage].const_buffer_count;
@@ -209,6 +255,7 @@ void lvp_lower_pipeline_layout(const struct lvp_device 
*device,
                                struct lvp_pipeline_layout *layout,
                                nir_shader *shader)
 {
+   nir_shader_lower_instructions(shader, lower_uniform_block_access, 
lower_block_instr, layout);
    nir_shader_lower_instructions(shader, lower_vulkan_resource_index, 
lower_vri_instr, layout);
    nir_foreach_variable_with_modes(var, shader, nir_var_uniform |
                                                 nir_var_image) {
diff --git a/src/gallium/frontends/lavapipe/lvp_private.h 
b/src/gallium/frontends/lavapipe/lvp_private.h
index 937b054c175..d36fdde98b7 100644
--- a/src/gallium/frontends/lavapipe/lvp_private.h
+++ b/src/gallium/frontends/lavapipe/lvp_private.h
@@ -77,6 +77,8 @@ extern "C" {
 #define MAX_SETS         8
 #define MAX_PUSH_CONSTANTS_SIZE 128
 #define MAX_PUSH_DESCRIPTORS 32
+#define MAX_DESCRIPTOR_UNIFORM_BLOCK_SIZE 4096
+#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS 8
 
 #ifdef _WIN32
 #define lvp_printflike(a, b)
@@ -336,6 +338,8 @@ struct lvp_descriptor_set_binding_layout {
       int16_t sampler_index;
       int16_t sampler_view_index;
       int16_t image_index;
+      int16_t uniform_block_index;
+      int16_t uniform_block_offset;
    } stage[MESA_SHADER_STAGES];
 
    /* Immutable samplers (or NULL if no immutable samplers) */
@@ -365,6 +369,9 @@ struct lvp_descriptor_set_layout {
       uint16_t sampler_count;
       uint16_t sampler_view_count;
       uint16_t image_count;
+      uint16_t uniform_block_count;
+      uint16_t uniform_block_size;
+      uint16_t uniform_block_sizes[MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS]; 
//zero-indexed
    } stage[MESA_SHADER_STAGES];
 
    /* Number of dynamic offsets used by this descriptor set */
@@ -405,6 +412,7 @@ union lvp_descriptor_info {
       VkDeviceSize range;
    };
    struct lvp_buffer_view *buffer_view;
+   uint8_t *uniform;
 };
 
 struct lvp_descriptor {
@@ -461,6 +469,9 @@ struct lvp_pipeline_layout {
    uint32_t push_constant_size;
    VkShaderStageFlags push_constant_stages;
    struct {
+      uint16_t uniform_block_size;
+      uint16_t uniform_block_count;
+      uint16_t uniform_block_sizes[MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS * 
MAX_SETS];
    } stage[MESA_SHADER_STAGES];
 };
 

Reply via email to