Mesa (main): aux/tc: rework inter-batch renderpass info handling

GitLab Mirror Fri, 17 Mar 2023 08:19:46 -0700

Module: Mesa
Branch: main
Commit: 430db8107198f4a914d048039fbdb9d58de0464f
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=430db8107198f4a914d048039fbdb9d58de0464f


Author: Mike Blumenkrantz <[email protected]>
Date:   Wed Mar  8 18:36:24 2023 -0500

aux/tc: rework inter-batch renderpass info handling

the tricky part of tracking renderpass info in tc is handling batch
flushing. there are a number of places that can trigger it, but
there are only two types of flushes:
* full flushes (all commands execute)
* partial flushes (more commands will execute after)

the latter case is the important one, as it effectively means that
the current renderpass info needs to "roll over" into the next one,
and it's really the next info that the driver will want to look at.
this is made trickier by there being no way (for the driver) to distinguish
when a rollover happens in order to delay beginning a renderpass for
further parsing

to solve this, add a member to renderpass info to chain the rolled-over info,
which tc can then process when the driver tries to wait. this works "most"
of the time, except when an app/test blows out the tc batch count, in which
case this pointer will be non-null, and it can be directly signaled as a less
optimized renderpass to avoid deadlocking

also sometimes a flush will trigger sub-flushes for buffer lists, so
add an assert to ensure nobody tries using this with 
driver_calls_flush_notify=true

Acked-by: Marek Olšák <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21800>

---

 src/gallium/auxiliary/util/u_threaded_context.c | 120 ++++++++++++++++++++----
 src/gallium/auxiliary/util/u_threaded_context.h |   5 +-
 2 files changed, 104 insertions(+), 21 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_threaded_context.c 
b/src/gallium/auxiliary/util/u_threaded_context.c
index 75802fd5ad5..d4b0f7e5285 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -133,6 +133,21 @@ tc_batch_rp_info(struct tc_renderpass_info *info)
    return (struct tc_batch_rp_info *)info;
 }
 
+static void
+tc_sanitize_renderpass_info(struct threaded_context *tc)
+{
+   tc->renderpass_info_recording->cbuf_invalidate = 0;
+   tc->renderpass_info_recording->zsbuf_invalidate = false;
+   tc->renderpass_info_recording->cbuf_load |= 
(~tc->renderpass_info_recording->cbuf_clear) & 
BITFIELD_MASK(PIPE_MAX_COLOR_BUFS);
+   if (tc->fb_resources[PIPE_MAX_COLOR_BUFS] && 
!tc_renderpass_info_is_zsbuf_used(tc->renderpass_info_recording))
+      /* this should be a "safe" way to indicate to the driver that both loads 
and stores are required;
+      * driver can always detect invalidation
+      */
+      tc->renderpass_info_recording->zsbuf_clear_partial = true;
+   if (tc->num_queries_active)
+      tc->renderpass_info_recording->has_query_ends = true;
+}
+
 /* ensure the batch's array of renderpass data is large enough for the current 
index */
 static void
 tc_batch_renderpass_infos_resize(struct threaded_context *tc, struct tc_batch 
*batch)
@@ -158,6 +173,8 @@ tc_batch_renderpass_infos_resize(struct threaded_context 
*tc, struct tc_batch *b
       unsigned count = (batch->renderpass_infos.capacity - size) /
                        sizeof(struct tc_batch_rp_info);
       infos = batch->renderpass_infos.data;
+      if (infos->prev)
+         infos->prev->next = infos;
       for (unsigned i = 0; i < count; i++)
          util_queue_fence_init(&infos[start + i].ready);
       /* re-set current recording info on resize */
@@ -179,37 +196,67 @@ tc_signal_renderpass_info_ready(struct threaded_context 
*tc)
  * 'full_copy' is used for preserving data across non-blocking tc batch flushes
  */
 static void
-tc_batch_increment_renderpass_info(struct threaded_context *tc, bool full_copy)
+tc_batch_increment_renderpass_info(struct threaded_context *tc, unsigned 
batch_idx, bool full_copy)
 {
-   struct tc_batch *batch = &tc->batch_slots[tc->next];
+   struct tc_batch *batch = &tc->batch_slots[batch_idx];
    struct tc_batch_rp_info *tc_info = batch->renderpass_infos.data;
 
-   /* signal existing info since it will not be used anymore */
-   tc_signal_renderpass_info_ready(tc);
+   if (tc_info[0].next || batch->num_total_slots) {
+      /* deadlock condition detected: all batches are in flight, renderpass 
hasn't ended
+       * (probably a cts case)
+       */
+      struct tc_batch_rp_info *info = 
tc_batch_rp_info(tc->renderpass_info_recording);
+      if (!util_queue_fence_is_signalled(&info->ready)) {
+         /* this batch is actively executing and the driver is waiting on the 
recording fence to signal */
+         /* force all buffer usage to avoid data loss */
+         info->info.cbuf_load = ~(BITFIELD_MASK(8) & info->info.cbuf_clear);
+         info->info.zsbuf_clear_partial = true;
+         info->info.has_query_ends = tc->num_queries_active > 0;
+         /* ensure threaded_context_get_renderpass_info() won't deadlock */
+         info->next = NULL;
+         util_queue_fence_signal(&info->ready);
+      }
+      /* always wait on the batch to finish since this will otherwise 
overwrite thread data */
+      util_queue_fence_wait(&batch->fence);
+   }
    /* increment rp info and initialize it */
    batch->renderpass_info_idx++;
    tc_batch_renderpass_infos_resize(tc, batch);
    tc_info = batch->renderpass_infos.data;
 
    if (full_copy) {
+      /* this should only be called when changing batches */
+      assert(batch->renderpass_info_idx == 0);
       /* copy the previous data in its entirety: this is still the same 
renderpass */
       if (tc->renderpass_info_recording) {
          tc_info[batch->renderpass_info_idx].info.data = 
tc->renderpass_info_recording->data;
+         tc_batch_rp_info(tc->renderpass_info_recording)->next = 
&tc_info[batch->renderpass_info_idx];
+         tc_info[batch->renderpass_info_idx].prev = 
tc_batch_rp_info(tc->renderpass_info_recording);
+         /* guard against deadlock scenario */
+         assert(&tc_batch_rp_info(tc->renderpass_info_recording)->next->info 
!= tc->renderpass_info_recording);
       } else {
          tc_info[batch->renderpass_info_idx].info.data = 0;
+         tc_info[batch->renderpass_info_idx].prev = NULL;
       }
    } else {
       /* selectively copy: only the CSO metadata is copied, and a new 
framebuffer state will be added later */
       tc_info[batch->renderpass_info_idx].info.data = 0;
       if (tc->renderpass_info_recording) {
          tc_info[batch->renderpass_info_idx].info.data16[2] = 
tc->renderpass_info_recording->data16[2];
+         tc_batch_rp_info(tc->renderpass_info_recording)->next = NULL;
+         tc_info[batch->renderpass_info_idx].prev = NULL;
       }
    }
 
+   assert(!full_copy || !tc->renderpass_info_recording || 
tc_batch_rp_info(tc->renderpass_info_recording)->next);
+   /* signal existing info since it will not be used anymore */
+   tc_signal_renderpass_info_ready(tc);
    util_queue_fence_reset(&tc_info[batch->renderpass_info_idx].ready);
+   /* guard against deadlock scenario */
    assert(tc->renderpass_info_recording != 
&tc_info[batch->renderpass_info_idx].info);
    /* this is now the current recording renderpass info */
    tc->renderpass_info_recording = &tc_info[batch->renderpass_info_idx].info;
+   batch->max_renderpass_info_idx = batch->renderpass_info_idx;
 }
 
 static ALWAYS_INLINE struct tc_renderpass_info *
@@ -385,10 +432,18 @@ tc_batch_execute(void *job, UNUSED void *gdata, int 
thread_index)
    /* setup renderpass info */
    batch->tc->renderpass_info = batch->renderpass_infos.data;
 
-   if (batch->tc->options.parse_renderpass_info)
+   if (batch->tc->options.parse_renderpass_info) {
       batch_execute(batch, pipe, last, true);
-   else
+
+      struct tc_batch_rp_info *info = batch->renderpass_infos.data;
+      for (unsigned i = 0; i < batch->max_renderpass_info_idx + 1; i++) {
+         if (info[i].next)
+            info[i].next->prev = NULL;
+         info[i].next = NULL;
+      }
+   } else {
       batch_execute(batch, pipe, last, false);
+   }
 
    /* Add the fence to the list of fences for the driver to signal at the next
     * flush, which we use for tracking which buffers are referenced by
@@ -418,6 +473,7 @@ tc_batch_execute(void *job, UNUSED void *gdata, int 
thread_index)
    batch->num_total_slots = 0;
    batch->last_mergeable_call = NULL;
    batch->first_set_fb = false;
+   batch->max_renderpass_info_idx = 0;
 }
 
 static void
@@ -441,6 +497,7 @@ static void
 tc_batch_flush(struct threaded_context *tc, bool full_copy)
 {
    struct tc_batch *next = &tc->batch_slots[tc->next];
+   unsigned next_id = (tc->next + 1) % TC_MAX_BATCHES;
 
    tc_assert(next->num_total_slots != 0);
    tc_batch_check(next);
@@ -455,19 +512,20 @@ tc_batch_flush(struct threaded_context *tc, bool 
full_copy)
    /* reset renderpass info index for subsequent use */
    next->renderpass_info_idx = -1;
 
-   util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
-                      NULL, 0);
-   tc->last = tc->next;
-   tc->next = (tc->next + 1) % TC_MAX_BATCHES;
-   tc_begin_next_buffer_list(tc);
-
    /* always increment renderpass info on batch flush;
     * renderpass info can only be accessed by its owner batch during execution
     */
    if (tc->renderpass_info_recording) {
-      tc->batch_slots[tc->next].first_set_fb = full_copy;
-      tc_batch_increment_renderpass_info(tc, full_copy);
+      tc->batch_slots[next_id].first_set_fb = full_copy;
+      tc_batch_increment_renderpass_info(tc, next_id, full_copy);
    }
+
+   util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
+                      NULL, 0);
+   tc->last = tc->next;
+   tc->next = next_id;
+   tc_begin_next_buffer_list(tc);
+
 }
 
 /* This is the function that adds variable-sized calls into the current
@@ -588,6 +646,18 @@ _tc_sync(struct threaded_context *tc, UNUSED const char 
*info, UNUSED const char
 
    tc_debug_check(tc);
 
+   if (tc->options.parse_renderpass_info && tc->in_renderpass && 
!tc->flushing) {
+      /* corner case: if tc syncs for any reason but a driver flush during a 
renderpass,
+       * then the current renderpass info MUST be signaled to avoid 
deadlocking the driver
+       *
+       * this is not a "complete" signal operation, however, as it's unknown 
what calls may
+       * come after this one, which means that framebuffer attachment data is 
unreliable
+       * 
+       * to avoid erroneously passing bad state to the driver (e.g., allowing 
zsbuf elimination),
+       * force all attachments active and assume the app was going to get bad 
perf here anyway
+       */
+      tc_sanitize_renderpass_info(tc);
+   }
    tc_signal_renderpass_info_ready(tc);
 
    /* Only wait for queued calls... */
@@ -626,7 +696,7 @@ _tc_sync(struct threaded_context *tc, UNUSED const char 
*info, UNUSED const char
       int renderpass_info_idx = next->renderpass_info_idx;
       if (renderpass_info_idx > 0) {
          next->renderpass_info_idx = -1;
-         tc_batch_increment_renderpass_info(tc, false);
+         tc_batch_increment_renderpass_info(tc, tc->next, false);
       } else if (tc->renderpass_info_recording->has_draw) {
          tc->renderpass_info_recording->data32[0] = 0;
       }
@@ -1443,7 +1513,7 @@ tc_set_framebuffer_state(struct pipe_context *_pipe,
       tc->fb_resolve = fb->resolve;
       if (tc->seen_fb_state) {
          /* this is the end of a renderpass, so increment the renderpass info 
*/
-         tc_batch_increment_renderpass_info(tc, false);
+         tc_batch_increment_renderpass_info(tc, tc->next, false);
          /* if zsbuf hasn't changed (i.e., possibly just adding a color 
buffer):
           * keep zsbuf usage data
           */
@@ -3508,6 +3578,7 @@ tc_flush(struct pipe_context *_pipe, struct 
pipe_fence_handle **fence,
    }
 
 out_of_memory:
+   tc->flushing = true;
    /* renderpass info is signaled during sync */
    tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
                    flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
@@ -3520,6 +3591,7 @@ out_of_memory:
    tc_set_driver_thread(tc);
    pipe->flush(pipe, fence, flags);
    tc_clear_driver_thread(tc);
+   tc->flushing = false;
 }
 
 struct tc_draw_single {
@@ -4885,8 +4957,11 @@ threaded_context_create(struct pipe_context *pipe,
       return NULL;
    }
 
-   if (options)
+   if (options) {
+      /* this is unimplementable */
+      assert(!(options->parse_renderpass_info && 
options->driver_calls_flush_notify));
       tc->options = *options;
+   }
 
    pipe = trace_context_create_threaded(pipe->screen, pipe, &replace_buffer, 
&tc->options);
 
@@ -5095,7 +5170,7 @@ threaded_context_create(struct pipe_context *pipe,
 
    tc_begin_next_buffer_list(tc);
    if (tc->options.parse_renderpass_info)
-      tc_batch_increment_renderpass_info(tc, false);
+      tc_batch_increment_renderpass_info(tc, tc->next, false);
    return &tc->base;
 
 fail:
@@ -5117,7 +5192,12 @@ threaded_context_init_bytes_mapped_limit(struct 
threaded_context *tc, unsigned d
 const struct tc_renderpass_info *
 threaded_context_get_renderpass_info(struct threaded_context *tc)
 {
+   assert(tc->renderpass_info && tc->options.parse_renderpass_info);
    struct tc_batch_rp_info *info = tc_batch_rp_info(tc->renderpass_info);
-   util_queue_fence_wait(&info->ready);
-   return &info->info;
+   while (1) {
+      util_queue_fence_wait(&info->ready);
+      if (!info->next)
+         return &info->info;
+      info = info->next;
+   }
 }
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h 
b/src/gallium/auxiliary/util/u_threaded_context.h
index 30913b673de..0e5b3f9c874 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -510,7 +510,8 @@ struct tc_batch {
    uint16_t num_total_slots;
    uint16_t buffer_list_index;
    /* the index of the current renderpass info for recording */
-   int renderpass_info_idx;
+   int16_t renderpass_info_idx;
+   uint16_t max_renderpass_info_idx;
 
    /* The last mergeable call that was added to this batch (i.e.
     * buffer subdata). This might be out-of-date or NULL.
@@ -608,6 +609,8 @@ struct threaded_context {
    bool in_renderpass;
    /* whether a query has ended more recently than a draw */
    bool query_ended;
+   /* whether pipe_context::flush has been called */
+   bool flushing;
 
    bool seen_streamout_buffers;
    bool seen_shader_buffers[PIPE_SHADER_TYPES];

Mesa (main): aux/tc: rework inter-batch renderpass info handling

Reply via email to