Module: Mesa Branch: main Commit: 688f03e3699973157a9f8b0514e956c37f9fb9cd URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=688f03e3699973157a9f8b0514e956c37f9fb9cd
Author: Lionel Landwerlin <[email protected]> Date: Fri May 19 21:34:46 2023 +0300 iris: use COMPUTE_WALKER post sync field to track compute work Signed-off-by: Lionel Landwerlin <[email protected]> Reviewed-by: Felix DeGrood <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23131> --- src/gallium/drivers/iris/iris_batch.c | 10 +-- src/gallium/drivers/iris/iris_context.h | 8 ++- src/gallium/drivers/iris/iris_genx_macros.h | 3 + src/gallium/drivers/iris/iris_screen.h | 5 ++ src/gallium/drivers/iris/iris_state.c | 31 +++++++- src/gallium/drivers/iris/iris_utrace.c | 107 +++++++++++++++++++++++----- 6 files changed, 140 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c index 28db1d7d90c..b66f990ef6c 100644 --- a/src/gallium/drivers/iris/iris_batch.c +++ b/src/gallium/drivers/iris/iris_batch.c @@ -551,9 +551,9 @@ void iris_batch_maybe_begin_frame(struct iris_batch *batch) { struct iris_context *ice = batch->ice; - if (ice->tracing_begin_frame != ice->frame) { + if (ice->utrace.begin_frame != ice->frame) { trace_intel_begin_frame(&batch->trace, batch); - ice->tracing_begin_frame = ice->tracing_end_frame = ice->frame; + ice->utrace.begin_frame = ice->utrace.end_frame = ice->frame; } } @@ -656,9 +656,9 @@ iris_finish_batch(struct iris_batch *batch) trace_intel_end_batch(&batch->trace, batch->name); struct iris_context *ice = batch->ice; - if (ice->tracing_end_frame != ice->frame) { - trace_intel_end_frame(&batch->trace, batch, ice->tracing_end_frame); - ice->tracing_end_frame = ice->frame; + if (ice->utrace.end_frame != ice->frame) { + trace_intel_end_frame(&batch->trace, batch, ice->utrace.end_frame); + ice->utrace.end_frame = ice->frame; } /* Emit MI_BATCH_BUFFER_END to finish our batch. 
*/ diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 696c44f34d6..2a2726a62d3 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -738,8 +738,12 @@ struct iris_context { struct intel_perf_context *perf_ctx; /** Frame number for u_trace */ - uint32_t tracing_begin_frame; - uint32_t tracing_end_frame; + struct { + uint32_t begin_frame; + uint32_t end_frame; + uint64_t last_full_timestamp; + void *last_compute_walker; + } utrace; /** Frame number for debug prints */ uint32_t frame; diff --git a/src/gallium/drivers/iris/iris_genx_macros.h b/src/gallium/drivers/iris/iris_genx_macros.h index 8b80d74f9d8..d63ea41bb9b 100644 --- a/src/gallium/drivers/iris/iris_genx_macros.h +++ b/src/gallium/drivers/iris/iris_genx_macros.h @@ -108,6 +108,9 @@ __gen_get_batch_address(struct iris_batch *batch, void *location) #define iris_emit_cmd(batch, cmd, name) \ _iris_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name) +#define iris_emit_dwords(batch, n) \ + __gen_get_batch_dwords(batch, n) + #define iris_emit_merge(batch, dwords0, dwords1, num_dwords) \ do { \ uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \ diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h index ce0931c5b7e..1b22cb12a18 100644 --- a/src/gallium/drivers/iris/iris_screen.h +++ b/src/gallium/drivers/iris/iris_screen.h @@ -114,6 +114,11 @@ struct iris_vtable { uint32_t offset_in_bytes, uint32_t report_id); + void (*rewrite_compute_walker_pc)(struct iris_batch *batch, + uint32_t *walker, + struct iris_bo *bo, + uint32_t offset); + unsigned (*derived_program_state_size)(enum iris_program_cache_id id); void (*store_derived_program_state)(const struct intel_device_info *devinfo, enum iris_program_cache_id cache_id, diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index b745d7d65af..502617eb035 
100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -634,6 +634,31 @@ iris_copy_mem_mem(struct iris_batch *batch, iris_batch_sync_region_end(batch); } +static void +iris_rewrite_compute_walker_pc(struct iris_batch *batch, + uint32_t *walker, + struct iris_bo *bo, + uint32_t offset) +{ +#if GFX_VERx10 >= 125 + struct iris_screen *screen = batch->screen; + struct iris_address addr = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE); + + uint32_t dwords[GENX(COMPUTE_WALKER_length)]; + + _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) { + cw.PostSync.Operation = WriteTimestamp; + cw.PostSync.DestinationAddress = addr; + cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); + } + + for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++) + walker[i] |= dwords[i]; +#else + unreachable("Unsupported"); +#endif +} + static void emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline) { @@ -7628,7 +7653,10 @@ iris_upload_compute_walker(struct iris_context *ice, iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL); - iris_emit_cmd(batch, GENX(COMPUTE_WALKER), cw) { + ice->utrace.last_compute_walker = + iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length)); + _iris_pack_command(batch, GENX(COMPUTE_WALKER), + ice->utrace.last_compute_walker, cw) { cw.IndirectParameterEnable = grid->indirect; cw.SIMDSize = dispatch.simd_size / 16; cw.LocalXMaximum = grid->block[0] - 1; @@ -8901,6 +8929,7 @@ genX(init_screen_state)(struct iris_screen *screen) screen->vtbl.update_binder_address = iris_update_binder_address; screen->vtbl.upload_compute_state = iris_upload_compute_state; screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control; + screen->vtbl.rewrite_compute_walker_pc = iris_rewrite_compute_walker_pc; screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count; screen->vtbl.rebind_buffer = iris_rebind_buffer; screen->vtbl.load_register_reg32 = iris_load_register_reg32; diff 
--git a/src/gallium/drivers/iris/iris_utrace.c b/src/gallium/drivers/iris/iris_utrace.c index 415a4f416e5..44f1d23120c 100644 --- a/src/gallium/drivers/iris/iris_utrace.c +++ b/src/gallium/drivers/iris/iris_utrace.c @@ -39,25 +39,81 @@ #include <sys/stat.h> #include <unistd.h> +/** Timestamp structure format */ +union iris_utrace_timestamp { + /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or + * PIPE_CONTROL. + */ + uint64_t timestamp; + + /* Timestamp written by COMPUTE_WALKER::PostSync + * + * Layout is described in PRMs. + * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA: + * + * "The timestamp layout : + * [0] = 32b Context Timestamp Start + * [1] = 32b Global Timestamp Start + * [2] = 32b Context Timestamp End + * [3] = 32b Global Timestamp End" + */ + uint32_t compute_walker[4]; +}; + +static void * +iris_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size) +{ + struct iris_context *ice = + container_of(utctx, struct iris_context, ds.trace_context); + struct pipe_context *ctx = &ice->ctx; + struct iris_screen *screen = (struct iris_screen *)ctx->screen; + uint32_t iris_size = + (size / sizeof(uint64_t)) * sizeof(union iris_utrace_timestamp); + + struct iris_bo *bo = + iris_bo_alloc(screen->bufmgr, "utrace timestamps", + iris_size, 16 /* alignment */, + IRIS_MEMZONE_OTHER, + BO_ALLOC_COHERENT | BO_ALLOC_SMEM); + + void *ptr = iris_bo_map(NULL, bo, MAP_READ | MAP_WRITE); + memset(ptr, 0, iris_size); + + return bo; +} + +static void +iris_utrace_delete_ts_buffer(struct u_trace_context *utctx, void *timestamps) +{ + struct iris_bo *bo = timestamps; + iris_bo_unreference(bo); +} + static void iris_utrace_record_ts(struct u_trace *trace, void *cs, void *timestamps, unsigned idx, bool end_of_pipe) { struct iris_batch *batch = container_of(trace, struct iris_batch, trace); - struct iris_resource *res = (void *) timestamps; - struct iris_bo *bo = res->bo; + struct iris_context *ice = batch->ice; + struct iris_bo *bo = 
timestamps; + uint32_t ts_offset = idx * sizeof(union iris_utrace_timestamp); iris_use_pinned_bo(batch, bo, true, IRIS_DOMAIN_NONE); - if (end_of_pipe) { + const bool is_end_compute = + (cs == NULL && ice->utrace.last_compute_walker != NULL && end_of_pipe); + if (is_end_compute) { + batch->screen->vtbl.rewrite_compute_walker_pc( + batch, ice->utrace.last_compute_walker, bo, ts_offset); + ice->utrace.last_compute_walker = NULL; + } else if (end_of_pipe) { iris_emit_pipe_control_write(batch, "query: pipelined snapshot write", PIPE_CONTROL_WRITE_TIMESTAMP, - bo, idx * sizeof(uint64_t), 0ull); + bo, ts_offset, 0ull); } else { - batch->screen->vtbl.store_register_mem64(batch, - 0x2358, - bo, idx * sizeof(uint64_t), + batch->screen->vtbl.store_register_mem64(batch, 0x2358, + bo, ts_offset, false); } } @@ -70,19 +126,35 @@ iris_utrace_read_ts(struct u_trace_context *utctx, container_of(utctx, struct iris_context, ds.trace_context); struct pipe_context *ctx = &ice->ctx; struct iris_screen *screen = (struct iris_screen *)ctx->screen; - struct iris_resource *res = (void *) timestamps; - struct iris_bo *bo = res->bo; + struct iris_bo *bo = timestamps; if (idx == 0) iris_bo_wait_rendering(bo); - uint64_t *ts = iris_bo_map(NULL, bo, MAP_READ); + union iris_utrace_timestamp *ts = iris_bo_map(NULL, bo, MAP_READ); /* Don't translate the no-timestamp marker: */ - if (ts[idx] == U_TRACE_NO_TIMESTAMP) + if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP) return U_TRACE_NO_TIMESTAMP; - return intel_device_info_timebase_scale(screen->devinfo, ts[idx]); + /* Detect a 16bytes timestamp write */ + if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) { + /* The timestamp written by COMPUTE_WALKER::PostSync only has 32bits. We + * need to rebuild the full 64bits using the previous timestamp. We + * assume that utrace is reading the timestamp in order. Anyway + * timestamps roll over on 32bits in a few minutes so in most cases that + * should be correct. 
+ */ + uint64_t timestamp = + (ice->utrace.last_full_timestamp & 0xffffffff00000000) | + (uint64_t) ts[idx].compute_walker[3]; + + return intel_device_info_timebase_scale(screen->devinfo, timestamp); + } + + ice->utrace.last_full_timestamp = ts[idx].timestamp; + + return intel_device_info_timebase_scale(screen->devinfo, ts[idx].timestamp); } static void @@ -116,10 +188,13 @@ void iris_utrace_init(struct iris_context *ice) */ intel_ds_device_init(&ice->ds, screen->devinfo, screen->fd, minor % 128, INTEL_DS_API_OPENGL); - u_trace_pipe_context_init(&ice->ds.trace_context, &ice->ctx, - iris_utrace_record_ts, - iris_utrace_read_ts, - iris_utrace_delete_flush_data); + + u_trace_context_init(&ice->ds.trace_context, &ice->ctx, + iris_utrace_create_ts_buffer, + iris_utrace_delete_ts_buffer, + iris_utrace_record_ts, + iris_utrace_read_ts, + iris_utrace_delete_flush_data); for (int i = 0; i < IRIS_BATCH_COUNT; i++) { intel_ds_device_init_queue(&ice->ds, &ice->batches[i].ds, "%s",
