Module: Mesa
Branch: main
Commit: 688f03e3699973157a9f8b0514e956c37f9fb9cd
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=688f03e3699973157a9f8b0514e956c37f9fb9cd

Author: Lionel Landwerlin <[email protected]>
Date:   Fri May 19 21:34:46 2023 +0300

iris: use COMPUTE_WALKER post sync field to track compute work

Signed-off-by: Lionel Landwerlin <[email protected]>
Reviewed-by: Felix DeGrood <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131>

---

 src/gallium/drivers/iris/iris_batch.c       |  10 +--
 src/gallium/drivers/iris/iris_context.h     |   8 ++-
 src/gallium/drivers/iris/iris_genx_macros.h |   3 +
 src/gallium/drivers/iris/iris_screen.h      |   5 ++
 src/gallium/drivers/iris/iris_state.c       |  31 +++++++-
 src/gallium/drivers/iris/iris_utrace.c      | 107 +++++++++++++++++++++++-----
 6 files changed, 140 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c 
b/src/gallium/drivers/iris/iris_batch.c
index 28db1d7d90c..b66f990ef6c 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -551,9 +551,9 @@ void iris_batch_maybe_begin_frame(struct iris_batch *batch)
 {
    struct iris_context *ice = batch->ice;
 
-   if (ice->tracing_begin_frame != ice->frame) {
+   if (ice->utrace.begin_frame != ice->frame) {
       trace_intel_begin_frame(&batch->trace, batch);
-      ice->tracing_begin_frame = ice->tracing_end_frame = ice->frame;
+      ice->utrace.begin_frame = ice->utrace.end_frame = ice->frame;
    }
 }
 
@@ -656,9 +656,9 @@ iris_finish_batch(struct iris_batch *batch)
    trace_intel_end_batch(&batch->trace, batch->name);
 
    struct iris_context *ice = batch->ice;
-   if (ice->tracing_end_frame != ice->frame) {
-      trace_intel_end_frame(&batch->trace, batch, ice->tracing_end_frame);
-      ice->tracing_end_frame = ice->frame;
+   if (ice->utrace.end_frame != ice->frame) {
+      trace_intel_end_frame(&batch->trace, batch, ice->utrace.end_frame);
+      ice->utrace.end_frame = ice->frame;
    }
 
    /* Emit MI_BATCH_BUFFER_END to finish our batch. */
diff --git a/src/gallium/drivers/iris/iris_context.h 
b/src/gallium/drivers/iris/iris_context.h
index 696c44f34d6..2a2726a62d3 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -738,8 +738,12 @@ struct iris_context {
    struct intel_perf_context *perf_ctx;
 
    /** Frame number for u_trace */
-   uint32_t tracing_begin_frame;
-   uint32_t tracing_end_frame;
+   struct {
+      uint32_t begin_frame;
+      uint32_t end_frame;
+      uint64_t last_full_timestamp;
+      void    *last_compute_walker;
+   } utrace;
 
    /** Frame number for debug prints */
    uint32_t frame;
diff --git a/src/gallium/drivers/iris/iris_genx_macros.h 
b/src/gallium/drivers/iris/iris_genx_macros.h
index 8b80d74f9d8..d63ea41bb9b 100644
--- a/src/gallium/drivers/iris/iris_genx_macros.h
+++ b/src/gallium/drivers/iris/iris_genx_macros.h
@@ -108,6 +108,9 @@ __gen_get_batch_address(struct iris_batch *batch, void 
*location)
 #define iris_emit_cmd(batch, cmd, name) \
    _iris_pack_command(batch, cmd, __gen_get_batch_dwords(batch, 
__genxml_cmd_length(cmd)), name)
 
+#define iris_emit_dwords(batch, n) \
+   __gen_get_batch_dwords(batch, n)
+
 #define iris_emit_merge(batch, dwords0, dwords1, num_dwords)    \
    do {                                                         \
       uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \
diff --git a/src/gallium/drivers/iris/iris_screen.h 
b/src/gallium/drivers/iris/iris_screen.h
index ce0931c5b7e..1b22cb12a18 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -114,6 +114,11 @@ struct iris_vtable {
                                      uint32_t offset_in_bytes,
                                      uint32_t report_id);
 
+   void (*rewrite_compute_walker_pc)(struct iris_batch *batch,
+                                     uint32_t *walker,
+                                     struct iris_bo *bo,
+                                     uint32_t offset);
+
    unsigned (*derived_program_state_size)(enum iris_program_cache_id id);
    void (*store_derived_program_state)(const struct intel_device_info *devinfo,
                                        enum iris_program_cache_id cache_id,
diff --git a/src/gallium/drivers/iris/iris_state.c 
b/src/gallium/drivers/iris/iris_state.c
index b745d7d65af..502617eb035 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -634,6 +634,31 @@ iris_copy_mem_mem(struct iris_batch *batch,
    iris_batch_sync_region_end(batch);
 }
 
+static void
+iris_rewrite_compute_walker_pc(struct iris_batch *batch,
+                               uint32_t *walker,
+                               struct iris_bo *bo,
+                               uint32_t offset)
+{
+#if GFX_VERx10 >= 125
+   struct iris_screen *screen = batch->screen;
+   struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
+   /* Scratch COMPUTE_WALKER; only the PostSync fields are set explicitly. */
+   uint32_t dwords[GENX(COMPUTE_WALKER_length)];
+
+   _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) {
+      cw.PostSync.Operation          = WriteTimestamp;
+      cw.PostSync.DestinationAddress = addr;
+      cw.PostSync.MOCS               = iris_mocs(NULL, &screen->isl_dev, 0);
+   }
+   /* Merge into the caller's walker: OR each scratch dword into it. */
+   for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++)
+      walker[i] |= dwords[i];
+#else
+   unreachable("Unsupported"); /* taken when GFX_VERx10 < 125 */
+#endif
+}
+
 static void
 emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
 {
@@ -7628,7 +7653,10 @@ iris_upload_compute_walker(struct iris_context *ice,
 
    iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
 
-   iris_emit_cmd(batch, GENX(COMPUTE_WALKER), cw) {
+   ice->utrace.last_compute_walker =
+      iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length));
+   _iris_pack_command(batch, GENX(COMPUTE_WALKER),
+                      ice->utrace.last_compute_walker, cw) {
       cw.IndirectParameterEnable        = grid->indirect;
       cw.SIMDSize                       = dispatch.simd_size / 16;
       cw.LocalXMaximum                  = grid->block[0] - 1;
@@ -8901,6 +8929,7 @@ genX(init_screen_state)(struct iris_screen *screen)
    screen->vtbl.update_binder_address = iris_update_binder_address;
    screen->vtbl.upload_compute_state = iris_upload_compute_state;
    screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
+   screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc;
    screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
    screen->vtbl.rebind_buffer = iris_rebind_buffer;
    screen->vtbl.load_register_reg32 = iris_load_register_reg32;
diff --git a/src/gallium/drivers/iris/iris_utrace.c 
b/src/gallium/drivers/iris/iris_utrace.c
index 415a4f416e5..44f1d23120c 100644
--- a/src/gallium/drivers/iris/iris_utrace.c
+++ b/src/gallium/drivers/iris/iris_utrace.c
@@ -39,25 +39,81 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+/** One utrace timestamp slot; which member is valid depends on the writer */
+union iris_utrace_timestamp {
+   /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
+    * PIPE_CONTROL.
+    */
+   uint64_t timestamp;
+
+   /* Timestamp written by COMPUTE_WALKER::PostSync
+    *
+    * Layout is described in PRMs.
+    * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
+    *
+    *    "The timestamp layout :
+    *        [0] = 32b Context Timestamp Start
+    *        [1] = 32b Global Timestamp Start
+    *        [2] = 32b Context Timestamp End
+    *        [3] = 32b Global Timestamp End"
+    */
+   uint32_t compute_walker[4];
+};
+
+static void *
+iris_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
+{
+   struct iris_context *ice =
+      container_of(utctx, struct iris_context, ds.trace_context);
+   struct pipe_context *ctx = &ice->ctx;
+   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
+   uint32_t iris_size =
+      (size / sizeof(uint64_t)) * sizeof(union iris_utrace_timestamp);
+   /* iris_size: one timestamp union per uint64_t-sized unit of size. */
+   struct iris_bo *bo =
+      iris_bo_alloc(screen->bufmgr, "utrace timestamps",
+                    iris_size, 16 /* alignment */,
+                    IRIS_MEMZONE_OTHER,
+                    BO_ALLOC_COHERENT | BO_ALLOC_SMEM);
+   /* Zero-fill; iris_utrace_read_ts compares compute_walker[2]/[3] to 0. */
+   void *ptr = iris_bo_map(NULL, bo, MAP_READ | MAP_WRITE);
+   memset(ptr, 0, iris_size);
+   /* NOTE(review): bo and ptr are used without a NULL check -- confirm. */
+   return bo;
+}
+
+static void
+iris_utrace_delete_ts_buffer(struct u_trace_context *utctx, void *timestamps)
+{
+   struct iris_bo *bo = timestamps; /* presumably from create_ts_buffer */
+   iris_bo_unreference(bo); /* drops one reference; utctx is not used */
+}
+
 static void
 iris_utrace_record_ts(struct u_trace *trace, void *cs,
                       void *timestamps, unsigned idx,
                       bool end_of_pipe)
 {
    struct iris_batch *batch = container_of(trace, struct iris_batch, trace);
-   struct iris_resource *res = (void *) timestamps;
-   struct iris_bo *bo = res->bo;
+   struct iris_context *ice = batch->ice;
+   struct iris_bo *bo = timestamps;
+   uint32_t ts_offset = idx * sizeof(union iris_utrace_timestamp);
 
    iris_use_pinned_bo(batch, bo, true, IRIS_DOMAIN_NONE);
 
-   if (end_of_pipe) {
+   const bool is_end_compute =
+      (cs == NULL && ice->utrace.last_compute_walker != NULL && end_of_pipe);
+   if (is_end_compute) {
+      batch->screen->vtbl.rewrite_compute_walker_pc(
+         batch, ice->utrace.last_compute_walker, bo, ts_offset);
+      ice->utrace.last_compute_walker = NULL;
+   } else if (end_of_pipe) {
       iris_emit_pipe_control_write(batch, "query: pipelined snapshot write",
                                    PIPE_CONTROL_WRITE_TIMESTAMP,
-                                   bo, idx * sizeof(uint64_t), 0ull);
+                                   bo, ts_offset, 0ull);
    } else {
-      batch->screen->vtbl.store_register_mem64(batch,
-                                               0x2358,
-                                               bo, idx * sizeof(uint64_t),
+      batch->screen->vtbl.store_register_mem64(batch, 0x2358,
+                                               bo, ts_offset,
                                                false);
    }
 }
@@ -70,19 +126,35 @@ iris_utrace_read_ts(struct u_trace_context *utctx,
       container_of(utctx, struct iris_context, ds.trace_context);
    struct pipe_context *ctx = &ice->ctx;
    struct iris_screen *screen = (struct iris_screen *)ctx->screen;
-   struct iris_resource *res = (void *) timestamps;
-   struct iris_bo *bo = res->bo;
+   struct iris_bo *bo = timestamps;
 
    if (idx == 0)
       iris_bo_wait_rendering(bo);
 
-   uint64_t *ts = iris_bo_map(NULL, bo, MAP_READ);
+   union iris_utrace_timestamp *ts = iris_bo_map(NULL, bo, MAP_READ);
 
    /* Don't translate the no-timestamp marker: */
-   if (ts[idx] == U_TRACE_NO_TIMESTAMP)
+   if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
       return U_TRACE_NO_TIMESTAMP;
 
-   return intel_device_info_timebase_scale(screen->devinfo, ts[idx]);
+   /* Detect a 16bytes timestamp write */
+   if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
+      /* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits
+       * wide, so rebuild the full 64 bits by borrowing the upper half of the
+       * previous full timestamp. This assumes utrace reads timestamps in
+       * order. A 32-bit timestamp only rolls over every few minutes, so in
+       * most cases the result should be correct.
+       */
+      uint64_t timestamp =
+         (ice->utrace.last_full_timestamp & 0xffffffff00000000) |
+         (uint64_t) ts[idx].compute_walker[3];
+
+      return intel_device_info_timebase_scale(screen->devinfo, timestamp);
+   }
+
+   ice->utrace.last_full_timestamp = ts[idx].timestamp;
+
+   return intel_device_info_timebase_scale(screen->devinfo, ts[idx].timestamp);
 }
 
 static void
@@ -116,10 +188,13 @@ void iris_utrace_init(struct iris_context *ice)
     */
    intel_ds_device_init(&ice->ds, screen->devinfo, screen->fd, minor % 128,
                         INTEL_DS_API_OPENGL);
-   u_trace_pipe_context_init(&ice->ds.trace_context, &ice->ctx,
-                             iris_utrace_record_ts,
-                             iris_utrace_read_ts,
-                             iris_utrace_delete_flush_data);
+
+   u_trace_context_init(&ice->ds.trace_context, &ice->ctx,
+                        iris_utrace_create_ts_buffer,
+                        iris_utrace_delete_ts_buffer,
+                        iris_utrace_record_ts,
+                        iris_utrace_read_ts,
+                        iris_utrace_delete_flush_data);
 
    for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
       intel_ds_device_init_queue(&ice->ds, &ice->batches[i].ds, "%s",

Reply via email to