Mesa (marge_bot_batch_merge_job): ir3: Be able to reduce register limit for RA when CS has barriers

GitLab Mirror Fri, 07 Jan 2022 04:44:51 -0800

Module: Mesa
Branch: marge_bot_batch_merge_job
Commit: 47227cd1fed6e9004e583d4ce7a09c269b71f0cc
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=47227cd1fed6e9004e583d4ce7a09c269b71f0cc


Author: Danylo Piliaiev <[email protected]>
Date:   Tue Dec  7 15:15:23 2021 +0200

ir3: Be able to reduce register limit for RA when CS has barriers

If barriers are used, it must be possible for all waves in the workgroup
to execute concurrently. Thus we may have to reduce the registers limit.

Fixes a hang in "Digital Combat Simulator".

Signed-off-by: Danylo Piliaiev <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14110>

---

 src/freedreno/ir3/ir3_compiler.c                 |  2 +
 src/freedreno/ir3/ir3_compiler.h                 |  3 ++
 src/freedreno/ir3/ir3_compiler_nir.c             |  3 ++
 src/freedreno/ir3/ir3_ra.c                       | 53 +++++++++++++++++++++++-
 src/freedreno/ir3/ir3_shader.h                   |  3 ++
 src/gallium/drivers/freedreno/freedreno_screen.c |  4 +-
 6 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 26b120a87c0..dfc6aec037f 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -95,6 +95,8 @@ ir3_compiler_create(struct fd_device *dev, const struct 
fd_dev_id *dev_id,
    compiler->wave_granularity = 2;
    compiler->max_waves = 16;
 
+   compiler->max_variable_workgroup_size = 1024;
+
    if (compiler->gen >= 6) {
       compiler->samgq_workaround = true;
       /* a6xx split the pipeline state into geometry and fragment state, in
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 9b2edd3b944..68e5d944ea5 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -166,6 +166,9 @@ struct ir3_compiler {
     */
    bool has_getfiberid;
 
+   /* MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB */
+   uint32_t max_variable_workgroup_size;
+
    /* Type to use for 1b nir bools: */
    type_t bool_type;
 };
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c 
b/src/freedreno/ir3/ir3_compiler_nir.c
index b0572e67c9b..cb11d5120cd 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1400,6 +1400,8 @@ emit_control_barrier(struct ir3_context *ctx)
    barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
    barrier->barrier_class = IR3_BARRIER_EVERYTHING;
    array_insert(b, b->keeps, barrier);
+
+   ctx->so->has_barrier = true;
 }
 
 static void
@@ -3968,6 +3970,7 @@ emit_instructions(struct ir3_context *ctx)
       barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
       barrier->barrier_class = IR3_BARRIER_EVERYTHING;
       array_insert(ctx->block, ctx->block->keeps, barrier);
+      ctx->so->has_barrier = true;
    }
 
    /* And emit the body: */
diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c
index 56a8c7d17ea..0f50c99fc64 100644
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@@ -2212,6 +2212,54 @@ calc_min_limit_pressure(struct ir3_shader_variant *v,
    ralloc_free(ctx);
 }
 
+/*
+ * If barriers are used, it must be possible for all waves in the workgroup
+ * to execute concurrently. Thus we may have to reduce the registers limit.
+ */
+static void
+calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
+                                        struct ir3_pressure *limit_pressure)
+{
+   const struct ir3_compiler *compiler = v->shader->compiler;
+
+   unsigned threads_per_wg;
+   if (v->local_size_variable) {
+      /* We have to expect the worst case. */
+      threads_per_wg = compiler->max_variable_workgroup_size;
+   } else {
+      threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+   }
+
+   /* The register file is grouped into reg_size_vec4 number of parts.
+    * Each part has enough registers to add a single vec4 register to
+    * each thread of a single-sized wave-pair. With double threadsize
+    * each wave-pair would consume two parts of the register file to get
+    * a single vec4 for a thread. The more active wave-pairs the less
+    * parts each could get.
+    */
+
+   bool double_threadsize = ir3_should_double_threadsize(v, 0);
+   unsigned waves_per_wg = DIV_ROUND_UP(
+      threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
+                         compiler->wave_granularity);
+
+   uint32_t vec4_regs_per_thread =
+      compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
+   assert(vec4_regs_per_thread > 0);
+
+   uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
+
+   if (limit_pressure->full > half_regs_per_thread) {
+      if (v->mergedregs) {
+         limit_pressure->full = half_regs_per_thread;
+      } else {
+         /* TODO: Handle !mergedregs case, probably we would have to do this
+          * after the first register pressure pass.
+          */
+      }
+   }
+}
+
 int
 ir3_ra(struct ir3_shader_variant *v)
 {
@@ -2238,12 +2286,15 @@ ir3_ra(struct ir3_shader_variant *v)
    d("\thalf: %u", max_pressure.half);
    d("\tshared: %u", max_pressure.shared);
 
-   /* TODO: calculate half/full limit correctly for CS with barrier */
    struct ir3_pressure limit_pressure;
    limit_pressure.full = RA_FULL_SIZE;
    limit_pressure.half = RA_HALF_SIZE;
    limit_pressure.shared = RA_SHARED_SIZE;
 
+   if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
+      calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
+   }
+
    /* If requested, lower the limit so that spilling happens more often. */
    if (ir3_shader_debug & IR3_DBG_SPILLALL)
       calc_min_limit_pressure(v, live, &limit_pressure);
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 6dc005ea9f3..69bffc68d52 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -697,6 +697,9 @@ struct ir3_shader_variant {
    uint16_t local_size[3];
    bool local_size_variable;
 
+   /* Important for compute shader to determine max reg footprint */
+   bool has_barrier;
+
    struct ir3_disasm_info disasm_info;
 };
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c 
b/src/gallium/drivers/freedreno/freedreno_screen.c
index aae18be3d2b..969562887b9 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -724,6 +724,8 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum 
pipe_shader_ir ir_type,
    if (!has_compute(screen))
       return 0;
 
+   struct ir3_compiler *compiler = screen->compiler;
+
 #define RET(x)                                                                 
\
    do {                                                                        
\
       if (ret)                                                                 
\
@@ -780,7 +782,7 @@ fd_get_compute_param(struct pipe_screen *pscreen, enum 
pipe_shader_ir ir_type,
       RET((uint32_t[]){32}); // TODO
 
    case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
-      RET((uint64_t[]){1024}); // TODO
+      RET((uint64_t[]){ compiler->max_variable_workgroup_size });
    }
 
    return 0;

Mesa (marge_bot_batch_merge_job): ir3: Be able to reduce register limit for RA when CS has barriers

Reply via email to