Module: Mesa
Branch: main
Commit: 0ac8731b6cc54685fa3eb78dee24bd01854c645f
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=0ac8731b6cc54685fa3eb78dee24bd01854c645f

Author: Samuel Pitoiset <[email protected]>
Date:   Wed Aug 18 17:02:09 2021 +0200

radv: implement depth/stencil expand on compute

This works as long as the image is TC-compatible HTILE.

Signed-off-by: Samuel Pitoiset <[email protected]>
Reviewed-by: Bas Nieuwenhhuizen <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12452>

---

 src/amd/vulkan/radv_meta_decompress.c | 241 +++++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_private.h         |   4 +
 2 files changed, 243 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_meta_decompress.c 
b/src/amd/vulkan/radv_meta_decompress.c
index 1f1d8758c20..08c7f2d0a85 100644
--- a/src/amd/vulkan/radv_meta_decompress.c
+++ b/src/amd/vulkan/radv_meta_decompress.c
@@ -33,6 +33,123 @@ enum radv_depth_op {
    DEPTH_RESUMMARIZE,
 };
 
+static nir_shader *
+build_expand_depth_stencil_compute_shader(struct radv_device *dev)
+{
+   const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_2D, 
false, GLSL_TYPE_FLOAT);
+
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL, 
"expand_depth_stencil_compute");
+
+   /* We need at least 8/8/1 to cover an entire HTILE block in a single 
workgroup. */
+   b.shader->info.workgroup_size[0] = 8;
+   b.shader->info.workgroup_size[1] = 8;
+   b.shader->info.workgroup_size[2] = 1;
+   nir_variable *input_img = nir_variable_create(b.shader, nir_var_uniform, 
img_type, "in_img");
+   input_img->data.descriptor_set = 0;
+   input_img->data.binding = 0;
+
+   nir_variable *output_img = nir_variable_create(b.shader, nir_var_uniform, 
img_type, "out_img");
+   output_img->data.descriptor_set = 0;
+   output_img->data.binding = 1;
+
+   nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
+   nir_ssa_def *wg_id = nir_load_workgroup_id(&b, 32);
+   nir_ssa_def *block_size =
+      nir_imm_ivec4(&b, b.shader->info.workgroup_size[0], 
b.shader->info.workgroup_size[1],
+                    b.shader->info.workgroup_size[2], 0);
+
+   nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), 
invoc_id);
+
+   nir_ssa_def *data = nir_image_deref_load(
+      &b, 4, 32, &nir_build_deref_var(&b, input_img)->dest.ssa, global_id, 
nir_ssa_undef(&b, 1, 32),
+      nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D);
+
+   /* We need a NIR_SCOPE_DEVICE memory_scope because ACO will avoid
+    * creating a vmcnt(0) because it expects the L1 cache to keep memory
+    * operations in-order for the same workgroup. The vmcnt(0) seems
+    * necessary however. */
+   nir_scoped_barrier(&b, .execution_scope = NIR_SCOPE_WORKGROUP, 
.memory_scope = NIR_SCOPE_DEVICE,
+                      .memory_semantics = NIR_MEMORY_ACQ_REL, .memory_modes = 
nir_var_mem_ssbo);
+
+   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->dest.ssa, 
global_id,
+                         nir_ssa_undef(&b, 1, 32), data, nir_imm_int(&b, 0),
+                         .image_dim = GLSL_SAMPLER_DIM_2D);
+   return b.shader;
+}
+
+static VkResult
+create_expand_depth_stencil_compute(struct radv_device *device)
+{
+   VkResult result = VK_SUCCESS;
+   nir_shader *cs = build_expand_depth_stencil_compute_shader(device);
+
+   VkDescriptorSetLayoutCreateInfo ds_create_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+      .bindingCount = 2,
+      .pBindings = (VkDescriptorSetLayoutBinding[]){
+         {.binding = 0,
+          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          .descriptorCount = 1,
+          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+          .pImmutableSamplers = NULL},
+         {.binding = 1,
+          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+          .descriptorCount = 1,
+          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+          .pImmutableSamplers = NULL},
+      }};
+
+   result = radv_CreateDescriptorSetLayout(
+      radv_device_to_handle(device), &ds_create_info, 
&device->meta_state.alloc,
+      &device->meta_state.expand_depth_stencil_compute_ds_layout);
+   if (result != VK_SUCCESS)
+      goto cleanup;
+
+   VkPipelineLayoutCreateInfo pl_create_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 1,
+      .pSetLayouts = 
&device->meta_state.expand_depth_stencil_compute_ds_layout,
+      .pushConstantRangeCount = 0,
+      .pPushConstantRanges = NULL,
+   };
+
+   result = radv_CreatePipelineLayout(
+      radv_device_to_handle(device), &pl_create_info, 
&device->meta_state.alloc,
+      &device->meta_state.expand_depth_stencil_compute_p_layout);
+   if (result != VK_SUCCESS)
+      goto cleanup;
+
+   /* compute shader */
+
+   VkPipelineShaderStageCreateInfo pipeline_shader_stage = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module = vk_shader_module_handle_from_nir(cs),
+      .pName = "main",
+      .pSpecializationInfo = NULL,
+   };
+
+   VkComputePipelineCreateInfo vk_pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+      .stage = pipeline_shader_stage,
+      .flags = 0,
+      .layout = device->meta_state.expand_depth_stencil_compute_p_layout,
+   };
+
+   result = radv_CreateComputePipelines(
+      radv_device_to_handle(device), 
radv_pipeline_cache_to_handle(&device->meta_state.cache), 1,
+      &vk_pipeline_info, NULL,
+      &device->meta_state.expand_depth_stencil_compute_pipeline);
+   if (result != VK_SUCCESS)
+      goto cleanup;
+
+cleanup:
+   ralloc_free(cs);
+   return result;
+}
+
 static VkResult
 create_pass(struct radv_device *device, uint32_t samples, VkRenderPass *pass)
 {
@@ -263,6 +380,13 @@ radv_device_finish_meta_depth_decomp_state(struct 
radv_device *device)
       radv_DestroyPipeline(radv_device_to_handle(device),
                            state->depth_decomp[i].resummarize_pipeline, 
&state->alloc);
    }
+
+   radv_DestroyPipeline(radv_device_to_handle(device),
+                        state->expand_depth_stencil_compute_pipeline, 
&state->alloc);
+   radv_DestroyPipelineLayout(radv_device_to_handle(device),
+                              state->expand_depth_stencil_compute_p_layout, 
&state->alloc);
+   radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
+                                   
state->expand_depth_stencil_compute_ds_layout, &state->alloc);
 }
 
 VkResult
@@ -298,6 +422,10 @@ radv_device_init_meta_depth_decomp_state(struct 
radv_device *device, bool on_dem
          goto fail;
    }
 
+   res = create_expand_depth_stencil_compute(device);
+   if (res != VK_SUCCESS)
+      goto fail;
+
    return VK_SUCCESS;
 
 fail:
@@ -481,6 +609,112 @@ radv_process_depth_stencil(struct radv_cmd_buffer 
*cmd_buffer, struct radv_image
    radv_meta_restore(&saved_state, cmd_buffer);
 }
 
+static void
+radv_expand_depth_stencil_compute(struct radv_cmd_buffer *cmd_buffer, struct 
radv_image *image,
+                                  const VkImageSubresourceRange 
*subresourceRange)
+{
+   struct radv_meta_saved_state saved_state;
+   struct radv_image_view load_iview = {0};
+   struct radv_image_view store_iview = {0};
+   struct radv_device *device = cmd_buffer->device;
+
+   assert(radv_image_is_tc_compat_htile(image));
+
+   cmd_buffer->state.flush_bits |=
+      radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image);
+
+   radv_meta_save(&saved_state, cmd_buffer,
+                  RADV_META_SAVE_DESCRIPTORS | 
RADV_META_SAVE_COMPUTE_PIPELINE);
+
+   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), 
VK_PIPELINE_BIND_POINT_COMPUTE,
+                        
device->meta_state.expand_depth_stencil_compute_pipeline);
+
+   for (uint32_t l = 0; l < radv_get_levelCount(image, subresourceRange); l++) 
{
+      uint32_t width, height;
+
+      /* Do not decompress levels without HTILE. */
+      if (!radv_htile_enabled(image, subresourceRange->baseMipLevel + l))
+         continue;
+
+      width = radv_minify(image->info.width, subresourceRange->baseMipLevel + 
l);
+      height = radv_minify(image->info.height, subresourceRange->baseMipLevel 
+ l);
+
+      for (uint32_t s = 0; s < radv_get_layerCount(image, subresourceRange); 
s++) {
+         radv_image_view_init(
+            &load_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo){
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = radv_image_to_handle(image),
+               .viewType = VK_IMAGE_VIEW_TYPE_2D,
+               .format = image->vk_format,
+               .subresourceRange = {.aspectMask = subresourceRange->aspectMask,
+                                    .baseMipLevel = 
subresourceRange->baseMipLevel + l,
+                                    .levelCount = 1,
+                                    .baseArrayLayer = 
subresourceRange->baseArrayLayer + s,
+                                    .layerCount = 1},
+            },
+            &(struct radv_image_view_extra_create_info){.enable_compression = 
true});
+         radv_image_view_init(
+            &store_iview, cmd_buffer->device,
+            &(VkImageViewCreateInfo){
+               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+               .image = radv_image_to_handle(image),
+               .viewType = VK_IMAGE_VIEW_TYPE_2D,
+               .format = image->vk_format,
+               .subresourceRange = {.aspectMask = subresourceRange->aspectMask,
+                                    .baseMipLevel = 
subresourceRange->baseMipLevel + l,
+                                    .levelCount = 1,
+                                    .baseArrayLayer = 
subresourceRange->baseArrayLayer + s,
+                                    .layerCount = 1},
+            },
+            &(struct radv_image_view_extra_create_info){.disable_compression = 
true});
+
+         radv_meta_push_descriptor_set(
+            cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
+            device->meta_state.expand_depth_stencil_compute_p_layout, 0, /* 
set */
+            2, /* descriptorWriteCount */
+            (VkWriteDescriptorSet[]){{.sType = 
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+                                      .dstBinding = 0,
+                                      .dstArrayElement = 0,
+                                      .descriptorCount = 1,
+                                      .descriptorType = 
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+                                      .pImageInfo =
+                                         (VkDescriptorImageInfo[]){
+                                            {
+                                               .sampler = VK_NULL_HANDLE,
+                                               .imageView = 
radv_image_view_to_handle(&load_iview),
+                                               .imageLayout = 
VK_IMAGE_LAYOUT_GENERAL,
+                                            },
+                                         }},
+                                     {.sType = 
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+                                      .dstBinding = 1,
+                                      .dstArrayElement = 0,
+                                      .descriptorCount = 1,
+                                      .descriptorType = 
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+                                      .pImageInfo = (VkDescriptorImageInfo[]){
+                                         {
+                                            .sampler = VK_NULL_HANDLE,
+                                            .imageView = 
radv_image_view_to_handle(&store_iview),
+                                            .imageLayout = 
VK_IMAGE_LAYOUT_GENERAL,
+                                         },
+                                      }}});
+
+         radv_unaligned_dispatch(cmd_buffer, width, height, 1);
+      }
+   }
+
+   radv_meta_restore(&saved_state, cmd_buffer);
+
+   cmd_buffer->state.flush_bits |=
+      RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE |
+      radv_src_access_flush(cmd_buffer, VK_ACCESS_SHADER_WRITE_BIT, image);
+
+   /* Initialize the HTILE metadata as "fully expanded". */
+   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, 
image);
+
+   cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, image, 
subresourceRange, htile_value);
+}
+
 void
 radv_expand_depth_stencil(struct radv_cmd_buffer *cmd_buffer, struct 
radv_image *image,
                           const VkImageSubresourceRange *subresourceRange,
@@ -491,8 +725,11 @@ radv_expand_depth_stencil(struct radv_cmd_buffer 
*cmd_buffer, struct radv_image
    barrier.layout_transitions.depth_stencil_expand = 1;
    radv_describe_layout_transition(cmd_buffer, &barrier);
 
-   assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
-   radv_process_depth_stencil(cmd_buffer, image, subresourceRange, 
sample_locs, DEPTH_DECOMPRESS);
+   if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+      radv_process_depth_stencil(cmd_buffer, image, subresourceRange, 
sample_locs, DEPTH_DECOMPRESS);
+   } else {
+      radv_expand_depth_stencil_compute(cmd_buffer, image, subresourceRange);
+   }
 }
 
 void
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index b548b81dce2..89885cbb099 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -619,6 +619,10 @@ struct radv_meta_state {
       VkRenderPass pass;
    } depth_decomp[MAX_SAMPLES_LOG2];
 
+   VkDescriptorSetLayout expand_depth_stencil_compute_ds_layout;
+   VkPipelineLayout expand_depth_stencil_compute_p_layout;
+   VkPipeline expand_depth_stencil_compute_pipeline;
+
    struct {
       VkPipelineLayout p_layout;
       VkPipeline cmask_eliminate_pipeline;

Reply via email to