Module: Mesa
Branch: main
Commit: 04bfe828db8187bdda755d8c55ec2f8fbf3ae298
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=04bfe828db8187bdda755d8c55ec2f8fbf3ae298
Author: Paulo Zanoni <paulo.r.zan...@intel.com>
Date:   Fri Sep 29 11:17:32 2023 -0700

anv/sparse: allow sparse resources to use TR-TT as their backend

TR-TT is a hardware feature supported by both i915.ko and xe.ko, which
means we can now finally have Sparse Resources on i915.ko, and it gives
us two options for xe.ko (whichever is best should become the default).

In this patch we use batch commands to write the page tables and keep
them in device memory forever. We maintain a mirror of both the L3 and
L2 tables so we never have to read back the tables that are in device
memory.

We still have some things to improve, but with this commit, workloads
that didn't work at all due to the lack of sparse resources should at
least run.

This is still disabled by default on i915.ko; you can turn it on by
exporting ANV_SPARSE=1 before launching the application. For xe.ko,
switch to the TR-TT backend with ANV_SPARSE_USE_TRTT=1.

Reviewed-by: Lionel Landwerlin <lionel.g.landwer...@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zan...@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25512>
---
 src/intel/vulkan/anv_batch_chain.c      |  37 +++++
 src/intel/vulkan/anv_device.c           |  59 ++++++-
 src/intel/vulkan/anv_gem_stubs.c        |   8 +
 src/intel/vulkan/anv_genX.h             |   4 +
 src/intel/vulkan/anv_kmd_backend.h      |   3 +
 src/intel/vulkan/anv_private.h          |  68 ++++++++
 src/intel/vulkan/anv_sparse.c           | 276 +++++++++++++++++++++++++++++++-
 src/intel/vulkan/genX_cmd_buffer.c      |  54 +++++++
 src/intel/vulkan/genX_init_state.c      |  56 +++++++
 src/intel/vulkan/i915/anv_batch_chain.c |  99 +++++++++++-
 src/intel/vulkan/i915/anv_batch_chain.h |   6 +
 src/intel/vulkan/i915/anv_kmd_backend.c |   1 +
 src/intel/vulkan/xe/anv_batch_chain.c   |  45 ++++++
 src/intel/vulkan/xe/anv_batch_chain.h   |   5 +
 src/intel/vulkan/xe/anv_kmd_backend.c   |   1 +
 15 files changed, 715 insertions(+), 7 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 60af0fa3157..895b3979c81 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -118,6 +118,10 @@ VkResult anv_reloc_list_add_bo_impl(struct anv_reloc_list *list, struct anv_bo *target_bo) { + /* This can happen with sparse resources.
*/ + if (!target_bo) + return VK_SUCCESS; + uint32_t idx = target_bo->gem_handle; VkResult result = anv_reloc_list_grow_deps(list, (idx / BITSET_WORDBITS) + 1); @@ -1693,6 +1697,39 @@ anv_queue_submit_simple_batch(struct anv_queue *queue, return result; } +VkResult +anv_queue_submit_trtt_batch(struct anv_queue *queue, + struct anv_batch *batch) +{ + struct anv_device *device = queue->device; + VkResult result = VK_SUCCESS; + + uint32_t batch_size = align(batch->next - batch->start, 8); + + struct anv_bo *batch_bo; + result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo); + if (result != VK_SUCCESS) + return result; + + memcpy(batch_bo->map, batch->start, batch_size); +#ifdef SUPPORT_INTEL_INTEGRATED_GPUS + if (device->physical->memory.need_flush) + intel_flush_range(batch_bo->map, batch_size); +#endif + + if (INTEL_DEBUG(DEBUG_BATCH)) { + intel_print_batch(queue->decoder, batch_bo->map, batch_bo->size, + batch_bo->offset, false); + } + + result = device->kmd_backend->execute_trtt_batch(queue, batch_bo, + batch_size); + + anv_bo_pool_free(&device->batch_bo_pool, batch_bo); + + return result; +} + void anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers, uint32_t num_cmd_buffers) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index ea0dd5cd749..85028608366 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1444,8 +1444,17 @@ anv_physical_device_try_create(struct vk_instance *vk_instance, device->uses_relocs = device->info.kmd_type != INTEL_KMD_TYPE_XE; - device->has_sparse = device->info.kmd_type == INTEL_KMD_TYPE_XE && - debug_get_bool_option("ANV_SPARSE", true); + /* While xe.ko can use both vm_bind and TR-TT, i915.ko only has TR-TT. */ + if (device->info.kmd_type == INTEL_KMD_TYPE_XE) { + device->has_sparse = true; + device->sparse_uses_trtt = + debug_get_bool_option("ANV_SPARSE_USE_TRTT", false); + } else { + device->has_sparse = + device->info.ver >= 12 && + debug_get_bool_option("ANV_SPARSE", false); + device->sparse_uses_trtt = true; + } device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) || driQueryOptionb(&instance->dri_options, "always_flush_cache"); @@ -1732,6 +1741,11 @@ void anv_GetPhysicalDeviceProperties( const bool has_sparse_or_fake = pdevice->instance->has_fake_sparse || pdevice->has_sparse; + uint64_t sparse_addr_space_size = + !has_sparse_or_fake ? 0 : + pdevice->sparse_uses_trtt ? pdevice->va.trtt.size : + 1ULL << 48; + VkSampleCountFlags sample_counts = isl_device_get_sample_counts(&pdevice->isl_dev); @@ -1749,7 +1763,7 @@ void anv_GetPhysicalDeviceProperties( .maxMemoryAllocationCount = UINT32_MAX, .maxSamplerAllocationCount = 64 * 1024, .bufferImageGranularity = 1, - .sparseAddressSpaceSize = has_sparse_or_fake ? 
(1uLL << 48) : 0, + .sparseAddressSpaceSize = sparse_addr_space_size, .maxBoundDescriptorSets = MAX_SETS, .maxPerStageDescriptorSamplers = max_samplers, .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, @@ -3083,6 +3097,33 @@ anv_device_destroy_context_or_vm(struct anv_device *device) } } +static VkResult +anv_device_init_trtt(struct anv_device *device) +{ + struct anv_trtt *trtt = &device->trtt; + + if (pthread_mutex_init(&trtt->mutex, NULL) != 0) + return vk_error(device, VK_ERROR_INITIALIZATION_FAILED); + + return VK_SUCCESS; +} + +static void +anv_device_finish_trtt(struct anv_device *device) +{ + struct anv_trtt *trtt = &device->trtt; + + pthread_mutex_destroy(&trtt->mutex); + + vk_free(&device->vk.alloc, trtt->l3_mirror); + vk_free(&device->vk.alloc, trtt->l2_mirror); + + for (int i = 0; i < trtt->num_page_table_bos; i++) + anv_device_release_bo(device, trtt->page_table_bos[i]); + + vk_free(&device->vk.alloc, trtt->page_table_bos); +} + VkResult anv_CreateDevice( VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, @@ -3542,16 +3583,20 @@ VkResult anv_CreateDevice( goto fail_trivial_batch_bo_and_scratch_pool; } - result = anv_genX(device->info, init_device_state)(device); + result = anv_device_init_trtt(device); if (result != VK_SUCCESS) goto fail_btd_fifo_bo; + result = anv_genX(device->info, init_device_state)(device); + if (result != VK_SUCCESS) + goto fail_trtt; + struct vk_pipeline_cache_create_info pcc_info = { }; device->default_pipeline_cache = vk_pipeline_cache_create(&device->vk, &pcc_info, NULL); if (!device->default_pipeline_cache) { result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - goto fail_btd_fifo_bo; + goto fail_trtt; } /* Internal shaders need their own pipeline cache because, unlike the rest @@ -3654,6 +3699,8 @@ VkResult anv_CreateDevice( vk_pipeline_cache_destroy(device->internal_cache, NULL); fail_default_pipeline_cache: vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + fail_trtt: + anv_device_finish_trtt(device); fail_btd_fifo_bo: if (ANV_SUPPORT_RT && device->info->has_ray_tracing) anv_device_release_bo(device, device->btd_fifo_bo); @@ -3754,6 +3801,8 @@ void anv_DestroyDevice( vk_pipeline_cache_destroy(device->internal_cache, NULL); vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL); + anv_device_finish_trtt(device); + if (ANV_SUPPORT_RT && device->info->has_ray_tracing) anv_device_release_bo(device, device->btd_fifo_bo); diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c index f2b8103ea81..55f7a403ee7 100644 --- a/src/intel/vulkan/anv_gem_stubs.c +++ b/src/intel/vulkan/anv_gem_stubs.c @@ -65,6 +65,13 @@ stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, return VK_ERROR_UNKNOWN; } +static VkResult +stub_execute_trtt_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_size) +{ + return VK_ERROR_UNKNOWN; +} + static VkResult stub_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, @@ -190,6 +197,7 @@ const struct anv_kmd_backend *anv_stub_kmd_backend_get(void) .vm_bind_bo = stub_vm_bind_bo, .vm_unbind_bo = stub_vm_bind_bo, .execute_simple_batch = stub_execute_simple_batch, + .execute_trtt_batch = stub_execute_trtt_batch, .queue_exec_locked = stub_queue_exec_locked, .queue_exec_trace = stub_queue_exec_trace, .bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags, diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h index a9bd1134982..745a5d38907 100644 --- 
a/src/intel/vulkan/anv_genX.h +++ b/src/intel/vulkan/anv_genX.h @@ -283,3 +283,7 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state, void genX(emit_simple_shader_end)(struct anv_simple_shader *state); + +VkResult genX(init_trtt_context_state)(struct anv_queue *queue); + +VkResult genX(write_trtt_entries)(struct anv_trtt_submission *submit); diff --git a/src/intel/vulkan/anv_kmd_backend.h b/src/intel/vulkan/anv_kmd_backend.h index ed860eba81f..5e3f508e49b 100644 --- a/src/intel/vulkan/anv_kmd_backend.h +++ b/src/intel/vulkan/anv_kmd_backend.h @@ -77,6 +77,9 @@ struct anv_kmd_backend { struct anv_bo *batch_bo, uint32_t batch_bo_size, bool is_companion_rcs_batch); + VkResult (*execute_trtt_batch)(struct anv_queue *queue, + struct anv_bo *batch_bo, + uint32_t batch_size); VkResult (*queue_exec_locked)(struct anv_queue *queue, uint32_t wait_count, const struct vk_sync_wait *waits, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 08f3ee3652d..52f1cd722c6 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -238,6 +238,22 @@ struct intel_perf_query_result; #define SO_BUFFER_INDEX_0_CMD 0x60 #define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b))) +/* The TR-TT L1 page table entries may contain these values instead of actual + * pointers to indicate the regions are either NULL or invalid. We program + * these values to TR-TT registers, so we could change them, but it's super + * convenient to have the NULL value be 0 because everything is + * zero-initialized when allocated. + * + * Since we reserve these values for NULL/INVALID, we can't use them as + * destinations for TR-TT address translation. Both values are shifted by 16 + * bits, which results in graphics addresses 0 and 64k. On Anv the first vma + * starts at 2MB, so we already don't use 0 and 64k for anything, so there's + * nothing really to reserve. We could instead just reserve random 64kb + * ranges from any of the non-TR-TT vmas and use their addresses. + */ +#define ANV_TRTT_L1_NULL_TILE_VAL 0 +#define ANV_TRTT_L1_INVALID_TILE_VAL 1 + static inline uint32_t align_down_npot_u32(uint32_t v, uint32_t a) { @@ -695,6 +711,21 @@ struct anv_state_stream { struct util_dynarray all_blocks; }; +struct anv_trtt_bind { + uint64_t pte_addr; + uint64_t entry_addr; +}; + +struct anv_trtt_submission { + struct anv_queue *queue; + + struct anv_trtt_bind *l3l2_binds; + struct anv_trtt_bind *l1_binds; + + int l3l2_binds_len; + int l1_binds_len; +}; + /* The block_pool functions exported for testing only. The block pool should * only be used via a state pool (see below). */ @@ -912,6 +943,7 @@ struct anv_physical_device { * a vm_bind ioctl). */ bool has_sparse; + bool sparse_uses_trtt; /** True if HW supports ASTC LDR */ bool has_astc_ldr; @@ -1724,6 +1756,40 @@ struct anv_device { */ VkCommandPool companion_rcs_cmd_pool; + struct anv_trtt { + pthread_mutex_t mutex; + + /* Sometimes we need to run batches from places where we don't have a + * queue coming from the API, so we use this. + */ + struct anv_queue *queue; + + /* There's only one L3 table, so if l3_addr is zero that means we + * didn't initialize the TR-TT context yet (i.e., we're not using TR-TT + * yet in this context). + */ + uint64_t l3_addr; + + /* We don't want to access the page tables from the CPU, so just + * maintain a mirror that we can use.
+ */ + uint64_t *l3_mirror; + uint64_t *l2_mirror; + + /* We keep a dynamic list of page table bos, and each bo can store + * multiple page tables. + */ + struct anv_bo **page_table_bos; + int num_page_table_bos; + int page_table_bos_capacity; + + /* These are used to keep track of space available for more page tables + * within a bo. + */ + struct anv_bo *cur_page_table_bo; + uint64_t next_page_table_bo_offset; + } trtt; + /* This is true if the user ever bound a sparse resource to memory. This * is used for a workaround that makes every memoryBarrier flush more * things than it should. Many applications request for the sparse @@ -1861,6 +1927,8 @@ VkResult anv_queue_submit(struct vk_queue *queue, VkResult anv_queue_submit_simple_batch(struct anv_queue *queue, struct anv_batch *batch, bool is_companion_rcs_batch); +VkResult anv_queue_submit_trtt_batch(struct anv_queue *queue, + struct anv_batch *batch); void anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin); diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c index 8e8024db1ab..3da57f35de8 100644 --- a/src/intel/vulkan/anv_sparse.c +++ b/src/intel/vulkan/anv_sparse.c @@ -276,6 +276,275 @@ anv_sparse_get_standard_image_block_shape(enum isl_format format, return vk_extent3d_el_to_px(block_shape, layout); } +/* We really want to try to have all the page tables on as few BOs as possible + * to benefit from cache locality and to keep the i915.ko relocation lists + * small. On the other hand, we don't want to waste memory on unused space. + */ +#define ANV_TRTT_PAGE_TABLE_BO_SIZE (2 * 1024 * 1024) + +static VkResult +trtt_make_page_table_bo(struct anv_device *device, struct anv_bo **bo) +{ + VkResult result; + struct anv_trtt *trtt = &device->trtt; + + result = anv_device_alloc_bo(device, "trtt-page-table", + ANV_TRTT_PAGE_TABLE_BO_SIZE, 0, 0, bo); + if (result != VK_SUCCESS) + return result; + + if (trtt->num_page_table_bos < trtt->page_table_bos_capacity) { + trtt->page_table_bos[trtt->num_page_table_bos++] = *bo; + } else { + + int new_capacity = MAX2(8, trtt->page_table_bos_capacity * 2); + struct anv_bo **new_page_table_bos = + vk_realloc(&device->vk.alloc, trtt->page_table_bos, + new_capacity * sizeof(*trtt->page_table_bos), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!new_page_table_bos) { + anv_device_release_bo(device, *bo); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + + new_page_table_bos[trtt->num_page_table_bos] = *bo; + + trtt->page_table_bos = new_page_table_bos; + trtt->page_table_bos_capacity = new_capacity; + trtt->num_page_table_bos++; + } + + trtt->cur_page_table_bo = *bo; + trtt->next_page_table_bo_offset = 0; + + sparse_debug("new number of page table BOs: %d\n", + trtt->num_page_table_bos); + + return VK_SUCCESS; +} + +static VkResult +trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo, + uint64_t *bo_addr) +{ + struct anv_trtt *trtt = &device->trtt; + VkResult result; + + if (!trtt->cur_page_table_bo) { + result = trtt_make_page_table_bo(device, bo); + if (result != VK_SUCCESS) + return result; + } + + *bo = trtt->cur_page_table_bo; + *bo_addr = trtt->cur_page_table_bo->offset + + trtt->next_page_table_bo_offset; + + trtt->next_page_table_bo_offset += 4096; + if (trtt->next_page_table_bo_offset >= ANV_TRTT_PAGE_TABLE_BO_SIZE) + trtt->cur_page_table_bo = NULL; + + return VK_SUCCESS; +} + +static VkResult +anv_trtt_init_context_state(struct anv_queue *queue) +{ + struct anv_device *device = queue->device; + struct anv_trtt *trtt = 
&device->trtt; + + struct anv_bo *l3_bo; + VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr); + if (result != VK_SUCCESS) + return result; + + trtt->l3_mirror = vk_zalloc(&device->vk.alloc, 4096, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!trtt->l3_mirror) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + return result; + } + + /* L3 has 512 entries, so we can have up to 512 L2 tables. */ + trtt->l2_mirror = vk_zalloc(&device->vk.alloc, 512 * 4096, 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + if (!trtt->l2_mirror) { + result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + goto fail_free_l3; + } + + result = anv_genX(device->info, init_trtt_context_state)(queue); + + return result; + +fail_free_l3: + vk_free(&device->vk.alloc, trtt->l3_mirror); + return result; +} + +static void +anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len, + uint64_t pte_addr, uint64_t entry_addr) +{ + binds[*binds_len] = (struct anv_trtt_bind) { + .pte_addr = pte_addr, + .entry_addr = entry_addr, + }; + (*binds_len)++; +} + +/* For L3 and L2 pages, null and invalid entries are indicated by bits 1 and 0 + * respectively. For L1 entries, the hardware compares the addresses against + * what we program to the GFX_TRTT_NULL and GFX_TRTT_INVAL registers. + */ +#define ANV_TRTT_L3L2_NULL_ENTRY (1 << 1) +#define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0) + +/* Adds elements to the anv_trtt_bind structs passed. This doesn't write the + * entries to the HW yet. + */ +static VkResult +anv_trtt_bind_add(struct anv_device *device, + uint64_t trtt_addr, uint64_t dest_addr, + struct anv_trtt_submission *s) +{ + VkResult result = VK_SUCCESS; + struct anv_trtt *trtt = &device->trtt; + bool is_null_bind = dest_addr == ANV_TRTT_L1_NULL_TILE_VAL; + + int l3_index = (trtt_addr >> 35) & 0x1FF; + int l2_index = (trtt_addr >> 26) & 0x1FF; + int l1_index = (trtt_addr >> 16) & 0x3FF; + + uint64_t l2_addr = trtt->l3_mirror[l3_index]; + if (l2_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) { + return VK_SUCCESS; + } else if (l2_addr == 0 || l2_addr == ANV_TRTT_L3L2_NULL_ENTRY) { + if (is_null_bind) { + trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY; + + anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, + trtt->l3_addr + l3_index * sizeof(uint64_t), + ANV_TRTT_L3L2_NULL_ENTRY); + + return VK_SUCCESS; + } + + struct anv_bo *l2_bo; + result = trtt_get_page_table_bo(device, &l2_bo, &l2_addr); + if (result != VK_SUCCESS) + return result; + + trtt->l3_mirror[l3_index] = l2_addr; + + anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, + trtt->l3_addr + l3_index * sizeof(uint64_t), l2_addr); + } + assert(l2_addr != 0 && l2_addr != ANV_TRTT_L3L2_NULL_ENTRY); + + /* The first page in the l2_mirror corresponds to l3_index=0 and so on. 
*/ + uint64_t l1_addr = trtt->l2_mirror[l3_index * 512 + l2_index]; + if (l1_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) { + return VK_SUCCESS; + } else if (l1_addr == 0 || l1_addr == ANV_TRTT_L3L2_NULL_ENTRY) { + if (is_null_bind) { + trtt->l2_mirror[l3_index * 512 + l2_index] = + ANV_TRTT_L3L2_NULL_ENTRY; + + anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, + l2_addr + l2_index * sizeof(uint64_t), + ANV_TRTT_L3L2_NULL_ENTRY); + + return VK_SUCCESS; + } + + struct anv_bo *l1_bo; + result = trtt_get_page_table_bo(device, &l1_bo, &l1_addr); + if (result != VK_SUCCESS) + return result; + + trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr; + + anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len, + l2_addr + l2_index * sizeof(uint64_t), l1_addr); + } + assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY); + + anv_trtt_bind_list_add_entry(s->l1_binds, &s->l1_binds_len, + l1_addr + l1_index * sizeof(uint32_t), dest_addr); + + return VK_SUCCESS; +} + +static VkResult +anv_sparse_bind_trtt(struct anv_device *device, int num_vm_binds, + struct anv_vm_bind *vm_binds) +{ + struct anv_trtt *trtt = &device->trtt; + VkResult result; + + /* These capacities are conservative estimations. For L1 binds the + * number will match exactly unless we skip NULL binds due to L2 already + * being NULL. For L3/L2 things are harder to estimate, but the resulting + * numbers are so small that a little overestimation won't hurt. + * + * We have assertions below to catch estimation errors. + */ + int l3l2_binds_capacity = 1; + int l1_binds_capacity = 0; + for (int b = 0; b < num_vm_binds; b++) { + int pages = vm_binds[b].size / (64 * 1024); + l1_binds_capacity += pages; + l3l2_binds_capacity += (pages / 1024 + 1) * 2; + } + + STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity); + STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity); + struct anv_trtt_submission s = { + .queue = trtt->queue, + .l3l2_binds = l3l2_binds, + .l1_binds = l1_binds, + .l3l2_binds_len = 0, + .l1_binds_len = 0, + }; + + pthread_mutex_lock(&trtt->mutex); + + if (!trtt->l3_addr) + anv_trtt_init_context_state(s.queue); + + assert(trtt->l3_addr); + + for (int b = 0; b < num_vm_binds; b++) { + for (size_t i = 0; i < vm_binds[b].size; i += 64 * 1024) { + uint64_t trtt_addr = vm_binds[b].address + i; + uint64_t dest_addr = + (vm_binds[b].op == ANV_VM_BIND && vm_binds[b].bo) ? + vm_binds[b].bo->offset + vm_binds[b].bo_offset + i : + ANV_TRTT_L1_NULL_TILE_VAL; + + result = anv_trtt_bind_add(device, trtt_addr, dest_addr, &s); + if (result != VK_SUCCESS) + goto out; + } + } + + assert(s.l3l2_binds_len <= l3l2_binds_capacity); + assert(s.l1_binds_len <= l1_binds_capacity); + + sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n", + num_vm_binds, s.l3l2_binds_len, s.l1_binds_len); + + if (s.l3l2_binds_len || s.l1_binds_len) + result = anv_genX(device->info, write_trtt_entries)(&s); + +out: + pthread_mutex_unlock(&trtt->mutex); + STACK_ARRAY_FINISH(l1_binds); + STACK_ARRAY_FINISH(l3l2_binds); + return result; +} + static VkResult anv_sparse_bind_vm_bind(struct anv_device *device, int num_binds, struct anv_vm_bind *binds) @@ -303,7 +572,9 @@ anv_sparse_bind(struct anv_device *device, dump_anv_vm_bind(device, sparse, &binds[b]); } - return anv_sparse_bind_vm_bind(device, num_binds, binds); + return device->physical->sparse_uses_trtt ? 
+ anv_sparse_bind_trtt(device, num_binds, binds) : + anv_sparse_bind_vm_bind(device, num_binds, binds); } VkResult @@ -316,6 +587,9 @@ anv_init_sparse_bindings(struct anv_device *device, { uint64_t size = align64(size_, ANV_SPARSE_BLOCK_SIZE); + if (device->physical->sparse_uses_trtt) + alloc_flags |= ANV_BO_ALLOC_TRTT; + sparse->address = anv_vma_alloc(device, size, ANV_SPARSE_BLOCK_SIZE, alloc_flags, intel_48b_address(client_address), diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 2e5387d8227..496b7ed8557 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -8426,3 +8426,57 @@ genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer, unreachable("Not implemented"); #endif } + +VkResult +genX(write_trtt_entries)(struct anv_trtt_submission *submit) +{ +#if GFX_VER >= 12 + struct anv_queue *queue = submit->queue; + size_t batch_size = submit->l3l2_binds_len * 20 + + submit->l1_binds_len * 16 + 8; + STACK_ARRAY(uint32_t, cmds, batch_size); + struct anv_batch batch = { + .start = cmds, + .next = cmds, + .end = (void *)cmds + batch_size, + }; + + /* TODO: writes to contiguous addresses can be combined into a single big + * MI_STORE_DATA_IMM instruction. + */ + + for (int i = 0; i < submit->l3l2_binds_len; i++) { + bool is_last_write = submit->l1_binds_len == 0 && + i + 1 == submit->l3l2_binds_len; + + anv_batch_emitn(&batch, 5, GENX(MI_STORE_DATA_IMM), + .ForceWriteCompletionCheck = is_last_write, + .StoreQword = true, + .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr), + .ImmediateData = submit->l3l2_binds[i].entry_addr, + ); + } + + for (int i = 0; i < submit->l1_binds_len; i++) { + bool is_last_write = i + 1 == submit->l1_binds_len; + + anv_batch_emit(&batch, GENX(MI_STORE_DATA_IMM), sdi) { + sdi.ForceWriteCompletionCheck = is_last_write; + sdi.Address = anv_address_from_u64(submit->l1_binds[i].pte_addr); + sdi.ImmediateData = + (submit->l1_binds[i].entry_addr >> 16) & 0xFFFFFFFF; + } + } + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + + assert(batch.next <= batch.end); + + VkResult result = anv_queue_submit_trtt_batch(queue, &batch); + STACK_ARRAY_FINISH(cmds); + + return result; + +#endif + return VK_SUCCESS; +} diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c index 98d0c40a490..1586686fa99 100644 --- a/src/intel/vulkan/genX_init_state.c +++ b/src/intel/vulkan/genX_init_state.c @@ -606,6 +606,9 @@ init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch) assert(batch.next <= batch.end); + if (!device->trtt.queue) + device->trtt.queue = queue; + return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch); } @@ -1205,3 +1208,56 @@ genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer) WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0); #endif } + +VkResult +genX(init_trtt_context_state)(struct anv_queue *queue) +{ +#if GFX_VER >= 12 + struct anv_device *device = queue->device; + struct anv_trtt *trtt = &device->trtt; + + uint32_t cmds[128]; + struct anv_batch batch = { + .start = cmds, + .next = cmds, + .end = (void *)cmds + sizeof(cmds), + }; + + anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) { + trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL; + } + anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) { + trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL; + } + anv_batch_write_reg(&batch, 
GENX(GFX_TRTT_VA_RANGE), trtt_va_range) { + trtt_va_range.TRVAMaskValue = 0xF; + trtt_va_range.TRVADataValue = 0xF; + } + + uint64_t l3_addr = trtt->l3_addr; + assert((l3_addr & 0xFFF) == 0); + anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) { + trtt_base_low.TRVAL3PointerLowerAddress = + (l3_addr & 0xFFFFF000) >> 12; + } + anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH), + trtt_base_high) { + trtt_base_high.TRVAL3PointerUpperAddress = + (l3_addr >> 32) & 0xFFFF; + } + /* Enabling TR-TT needs to be done after setting up the other registers. + */ + anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) { + trtt_cr.TRTTEnable = true; + } + + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + assert(batch.next <= batch.end); + + VkResult res = anv_queue_submit_simple_batch(queue, &batch, false); + if (res != VK_SUCCESS) + return res; + +#endif + return VK_SUCCESS; +} diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c index 925c40e09ca..1ea0697f2bd 100644 --- a/src/intel/vulkan/i915/anv_batch_chain.c +++ b/src/intel/vulkan/i915/anv_batch_chain.c @@ -338,6 +338,31 @@ get_context_and_exec_flags(struct anv_queue *queue, device->context_id; } +static VkResult +anv_execbuf_add_trtt_bos(struct anv_device *device, + struct anv_execbuf *execbuf) +{ + struct anv_trtt *trtt = &device->trtt; + VkResult result = VK_SUCCESS; + + /* If l3_addr is zero we're not using TR-TT, there's no bo to add. */ + if (!trtt->l3_addr) + return VK_SUCCESS; + + pthread_mutex_lock(&trtt->mutex); + + for (int i = 0; i < trtt->num_page_table_bos; i++) { + result = anv_execbuf_add_bo(device, execbuf, trtt->page_table_bos[i], + NULL, 0); + if (result != VK_SUCCESS) + goto out; + } + +out: + pthread_mutex_unlock(&trtt->mutex); + return result; +} + static VkResult setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, struct anv_queue *queue, @@ -401,7 +426,8 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, return result; /* Add the BOs for all user allocated memory objects because we can't - * track after binding updates of VK_EXT_descriptor_indexing. + * track after binding updates of VK_EXT_descriptor_indexing and due to how + * sparse resources work. */ list_for_each_entry(struct anv_device_memory, mem, &device->memory_objects, link) { @@ -410,6 +436,10 @@ setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf, return result; } + result = anv_execbuf_add_trtt_bos(device, execbuf); + if (result != VK_SUCCESS) + return result; + /* Add all the private BOs from images because we can't track after binding * updates of VK_EXT_descriptor_indexing. 
*/ @@ -954,6 +984,73 @@ fail: return result; } +VkResult +i915_execute_trtt_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_size) +{ + struct anv_device *device = queue->device; + struct anv_trtt *trtt = &device->trtt; + struct anv_execbuf execbuf = { + .alloc = &device->vk.alloc, + .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, + }; + VkResult result; + + result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, + 0); + if (result != VK_SUCCESS) + goto out; + + for (int i = 0; i < trtt->num_page_table_bos; i++) { + result = anv_execbuf_add_bo(device, &execbuf, trtt->page_table_bos[i], + NULL, EXEC_OBJECT_WRITE); + if (result != VK_SUCCESS) + goto out; + } + + result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0); + if (result != VK_SUCCESS) + goto out; + + if (INTEL_DEBUG(DEBUG_SUBMIT)) + anv_i915_debug_submit(&execbuf); + + uint64_t exec_flags = 0; + uint32_t context_id; + get_context_and_exec_flags(queue, false, &exec_flags, &context_id); + + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, + .batch_start_offset = 0, + .batch_len = batch_size, + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | exec_flags, + .rsvd1 = context_id, + .rsvd2 = 0, + }; + + int ret = queue->device->info->no_hw ? 0 : + anv_gem_execbuffer(device, &execbuf.execbuf); + if (ret) { + result = vk_device_set_lost(&device->vk, + "trtt anv_gem_execbuffer failed: %m"); + goto out; + } + + /* TODO: we can get rid of this wait once we can properly handle the buffer + * lifetimes. + */ + result = anv_device_wait(device, batch_bo, INT64_MAX); + if (result != VK_SUCCESS) { + result = vk_device_set_lost(&device->vk, + "trtt anv_device_wait failed: %m"); + } + +out: + anv_execbuf_finish(&execbuf); + return result; +} + VkResult i915_queue_exec_trace(struct anv_queue *queue, struct anv_utrace_submit *submit) diff --git a/src/intel/vulkan/i915/anv_batch_chain.h b/src/intel/vulkan/i915/anv_batch_chain.h index f46f19c90bc..eff38ce2ee2 100644 --- a/src/intel/vulkan/i915/anv_batch_chain.h +++ b/src/intel/vulkan/i915/anv_batch_chain.h @@ -29,6 +29,7 @@ #include "vk_sync.h" +struct anv_device; struct anv_queue; struct anv_bo; struct anv_cmd_buffer; @@ -41,6 +42,11 @@ i915_queue_exec_trace(struct anv_queue *queue, VkResult i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, uint32_t batch_bo_size, bool is_companion_rcs_batch); + +VkResult +i915_execute_trtt_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_size); + VkResult i915_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, diff --git a/src/intel/vulkan/i915/anv_kmd_backend.c b/src/intel/vulkan/i915/anv_kmd_backend.c index fe9be942ca8..887a9dbf6c5 100644 --- a/src/intel/vulkan/i915/anv_kmd_backend.c +++ b/src/intel/vulkan/i915/anv_kmd_backend.c @@ -277,6 +277,7 @@ anv_i915_kmd_backend_get(void) .vm_bind_bo = i915_vm_bind_bo, .vm_unbind_bo = i915_vm_bind_bo, .execute_simple_batch = i915_execute_simple_batch, + .execute_trtt_batch = i915_execute_trtt_batch, .queue_exec_locked = i915_queue_exec_locked, .queue_exec_trace = i915_queue_exec_trace, .bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags, diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c index baf04db3cb3..187be25e5ca 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.c +++ b/src/intel/vulkan/xe/anv_batch_chain.c @@ -178,6 +178,51 @@ xe_exec_print_debug(struct anv_queue 
*queue, uint32_t cmd_buffer_count, is_companion_rcs_cmd_buffer); } +VkResult +xe_execute_trtt_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_size) +{ + struct anv_device *device = queue->device; + VkResult result = VK_SUCCESS; + + uint32_t syncobj_handle; + if (drmSyncobjCreate(device->fd, 0, &syncobj_handle)) + return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj"); + + struct drm_xe_sync sync = { + .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL, + .handle = syncobj_handle, + }; + struct drm_xe_exec exec = { + .exec_queue_id = queue->exec_queue_id, + .num_batch_buffer = 1, + .address = batch_bo->offset, + .num_syncs = 1, + .syncs = (uintptr_t)&sync, + }; + + if (!device->info->no_hw) { + if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) { + result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m"); + goto exec_error; + } + } + + /* FIXME: we shouldn't need this wait, figure out a way to remove it. */ + struct drm_syncobj_wait wait = { + .handles = (uintptr_t)&syncobj_handle, + .timeout_nsec = INT64_MAX, + .count_handles = 1, + }; + if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait)) + result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m"); + +exec_error: + drmSyncobjDestroy(device->fd, syncobj_handle); + + return result; +} + VkResult xe_queue_exec_utrace_locked(struct anv_queue *queue, struct anv_utrace_submit *utrace_submit) diff --git a/src/intel/vulkan/xe/anv_batch_chain.h b/src/intel/vulkan/xe/anv_batch_chain.h index f664f9673dd..d11dd11316a 100644 --- a/src/intel/vulkan/xe/anv_batch_chain.h +++ b/src/intel/vulkan/xe/anv_batch_chain.h @@ -28,6 +28,7 @@ #include "vulkan/vulkan_core.h" #include "vk_sync.h" +struct anv_device; struct anv_queue; struct anv_bo; struct anv_cmd_buffer; @@ -38,6 +39,10 @@ VkResult xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo, uint32_t batch_bo_size, bool is_companion_rcs_batch); VkResult +xe_execute_trtt_batch(struct anv_queue *queue, struct anv_bo *batch_bo, + uint32_t batch_size); + +VkResult xe_queue_exec_locked(struct anv_queue *queue, uint32_t wait_count, const struct vk_sync_wait *waits, diff --git a/src/intel/vulkan/xe/anv_kmd_backend.c b/src/intel/vulkan/xe/anv_kmd_backend.c index 5063b19ec2c..a80bfb57c3b 100644 --- a/src/intel/vulkan/xe/anv_kmd_backend.c +++ b/src/intel/vulkan/xe/anv_kmd_backend.c @@ -220,6 +220,7 @@ anv_xe_kmd_backend_get(void) .vm_bind_bo = xe_vm_bind_bo, .vm_unbind_bo = xe_vm_unbind_bo, .execute_simple_batch = xe_execute_simple_batch, + .execute_trtt_batch = xe_execute_trtt_batch, .queue_exec_locked = xe_queue_exec_locked, .queue_exec_trace = xe_queue_exec_utrace_locked, .bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,
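
For reference, the address-splitting scheme used by anv_trtt_bind_add() above
works as follows: each L1 entry maps one 64KB tile, an L1 table holds 1024
entries, and the L2 and L3 tables hold 512 entries each, so bits [43:35],
[34:26] and [25:16] of a sparse address select the L3, L2 and L1 slots. The
snippet below is a minimal standalone sketch of that split; the shift/mask
constants come straight from the patch, while the trtt_split_address() helper
and the surrounding program are illustrative only and not part of Mesa.

/* Minimal sketch (not Mesa code): decompose a TR-TT sparse address into
 * L3/L2/L1 table indices, using the same shifts and masks as
 * anv_trtt_bind_add() in anv_sparse.c.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct trtt_indices {
   unsigned l3; /* selects one of 512 L2 tables         */
   unsigned l2; /* selects one of 512 L1 tables         */
   unsigned l1; /* selects one of 1024 64KB tile slots  */
};

static struct trtt_indices
trtt_split_address(uint64_t trtt_addr)
{
   return (struct trtt_indices) {
      .l3 = (trtt_addr >> 35) & 0x1FF,
      .l2 = (trtt_addr >> 26) & 0x1FF,
      .l1 = (trtt_addr >> 16) & 0x3FF,
   };
}

int main(void)
{
   /* Hypothetical 64KB-aligned sparse address, for illustration only. */
   uint64_t addr = 0x0000001234560000ull;
   struct trtt_indices idx = trtt_split_address(addr);

   printf("0x%012" PRIx64 " -> L3[%u] L2[%u] L1[%u]\n",
          addr, idx.l3, idx.l2, idx.l1);
   return 0;
}

With ANV_SPARSE=1 (i915.ko) or ANV_SPARSE_USE_TRTT=1 (xe.ko), every 64KB page
of a sparse bind goes through this split before genX(write_trtt_entries)
emits the corresponding MI_STORE_DATA_IMM writes.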