Module: Mesa
Branch: main
Commit: 029c686c6dbe37639bf6d391bb9361488a6a5ea6
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=029c686c6dbe37639bf6d391bb9361488a6a5ea6

Author: Alyssa Rosenzweig <[email protected]>
Date:   Fri Feb 17 18:24:38 2023 -0500

asahi: Implement color masks with masked stores

Blend states can require masking colour. Currently, this is handled by
nir_lower_blend, which lowers masks to a read-modify-write operation as required
on Mali hardware. However, our "tilebuffer store" instruction supports a write
mask, allowing us to write only a subset of channels to the tilebuffer. It's
more efficient to use that than to emit pointless tilebuffer loads.

Note that even without tilebuffer loads, non-opaque masks don't work with opaque
pass types.  Here, we handle this with a translucent pass type, which gets HSR
to do the right thing and is consistent with the pass type used previously.
However, it's a bit heavy-handed -- Apple manages to use an opaque pass type
with masking but with some unknown HSR fields twiddled. IMO reverse-engineering
those details shouldn't block this because this gets us closer to optimal (just
not all the way there) and is strictly better than what we had before.

Signed-off-by: Alyssa Rosenzweig <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21431>

---

 src/asahi/lib/agx_meta.c                 |  2 +-
 src/asahi/lib/agx_nir_lower_tilebuffer.c | 58 +++++++++++++++++++++++++++-----
 src/asahi/lib/agx_tilebuffer.c           |  8 +++++
 src/asahi/lib/agx_tilebuffer.h           |  6 +++-
 src/gallium/drivers/asahi/agx_state.c    | 32 ++++++++++++++++--
 5 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/src/asahi/lib/agx_meta.c b/src/asahi/lib/agx_meta.c
index a0dcdc2421f..bbd862175ca 100644
--- a/src/asahi/lib/agx_meta.c
+++ b/src/asahi/lib/agx_meta.c
@@ -19,7 +19,7 @@ agx_compile_meta_shader(struct agx_meta_cache *cache, nir_shader *shader,
 
    agx_preprocess_nir(shader);
    if (tib)
-      agx_nir_lower_tilebuffer(shader, tib);
+      agx_nir_lower_tilebuffer(shader, tib, NULL, NULL);
 
    struct agx_meta_shader *res = rzalloc(cache->ht, struct agx_meta_shader);
    agx_compile_shader_nir(shader, key, NULL, &binary, &res->info);
diff --git a/src/asahi/lib/agx_nir_lower_tilebuffer.c b/src/asahi/lib/agx_nir_lower_tilebuffer.c
index 3d49c2841cd..43a295b413f 100644
--- a/src/asahi/lib/agx_nir_lower_tilebuffer.c
+++ b/src/asahi/lib/agx_nir_lower_tilebuffer.c
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "compiler/agx_internal_formats.h"
 #include "agx_nir_format_helpers.h"
 #include "agx_tilebuffer.h"
 #include "nir.h"
@@ -10,6 +11,12 @@
 
 #define ALL_SAMPLES 0xFF
 
+struct ctx {
+   struct agx_tilebuffer_layout *tib;
+   uint8_t *colormasks;
+   bool *translucent;
+};
+
 static bool
 tib_filter(const nir_instr *instr, UNUSED const void *_)
 {
@@ -29,7 +36,8 @@ tib_filter(const nir_instr *instr, UNUSED const void *_)
 static nir_ssa_def *
 tib_impl(nir_builder *b, nir_instr *instr, void *data)
 {
-   struct agx_tilebuffer_layout *tib = data;
+   struct ctx *ctx = data;
+   struct agx_tilebuffer_layout *tib = ctx->tib;
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
@@ -41,10 +49,38 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
    unsigned comps = util_format_get_nr_components(logical_format);
 
    if (intr->intrinsic == nir_intrinsic_store_output) {
+      /* Only write components that actually exist */
+      uint16_t write_mask = BITFIELD_MASK(comps);
+
       /* Delete stores to nonexistant render targets */
       if (logical_format == PIPE_FORMAT_NONE)
          return NIR_LOWER_INSTR_PROGRESS_REPLACE;
 
+      /* Only write colours masked by the blend state */
+      if (ctx->colormasks)
+         write_mask &= ctx->colormasks[rt];
+
+      /* Masked stores require a translucent pass type */
+      if (write_mask != BITFIELD_MASK(comps)) {
+         assert(ctx->translucent != NULL &&
+                "colour masking requires translucency");
+
+         assert(agx_internal_format_supports_mask(format) &&
+                "write mask but format cannot be masked");
+
+         *(ctx->translucent) = true;
+      }
+
+      /* But we ignore the NIR write mask for that, since it's basically an
+       * optimization hint.
+       */
+      if (agx_internal_format_supports_mask(format))
+         write_mask &= nir_intrinsic_write_mask(intr);
+
+      /* Delete stores that are entirely masked out */
+      if (!write_mask)
+         return NIR_LOWER_INSTR_PROGRESS_REPLACE;
+
       nir_ssa_def *value = intr->src[0].ssa;
 
       /* Trim to format as required by hardware */
@@ -60,11 +96,9 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
             value = nir_f2f32(b, value);
       }
 
-      nir_store_local_pixel_agx(
-         b, value, nir_imm_intN_t(b, ALL_SAMPLES, 16),
-         .base = tib->offset_B[rt],
-         .write_mask = nir_intrinsic_write_mask(intr) & BITFIELD_MASK(comps),
-         .format = format);
+      nir_store_local_pixel_agx(b, value, nir_imm_intN_t(b, ALL_SAMPLES, 16),
+                                .base = tib->offset_B[rt],
+                                .write_mask = write_mask, .format = format);
 
       return NIR_LOWER_INSTR_PROGRESS_REPLACE;
    } else {
@@ -101,8 +135,16 @@ tib_impl(nir_builder *b, nir_instr *instr, void *data)
 }
 
 bool
-agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib)
+agx_nir_lower_tilebuffer(nir_shader *shader, struct agx_tilebuffer_layout *tib,
+                         uint8_t *colormasks, bool *translucent)
 {
    assert(shader->info.stage == MESA_SHADER_FRAGMENT);
-   return nir_shader_lower_instructions(shader, tib_filter, tib_impl, tib);
+
+   struct ctx ctx = {
+      .tib = tib,
+      .colormasks = colormasks,
+      .translucent = translucent,
+   };
+
+   return nir_shader_lower_instructions(shader, tib_filter, tib_impl, &ctx);
 }
diff --git a/src/asahi/lib/agx_tilebuffer.c b/src/asahi/lib/agx_tilebuffer.c
index 81df1d25636..0c724881833 100644
--- a/src/asahi/lib/agx_tilebuffer.c
+++ b/src/asahi/lib/agx_tilebuffer.c
@@ -5,6 +5,7 @@
 
 #include "agx_tilebuffer.h"
 #include <assert.h>
+#include "compiler/agx_internal_formats.h"
 #include "util/format/u_format.h"
 #include "agx_formats.h"
 #include "agx_usc.h"
@@ -77,6 +78,13 @@ agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt)
    return agx_pixel_format[tib->logical_format[rt]].internal;
 }
 
+bool
+agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib, unsigned rt)
+{
+   enum pipe_format fmt = agx_tilebuffer_physical_format(tib, rt);
+   return agx_internal_format_supports_mask((enum agx_internal_formats)fmt);
+}
+
 static unsigned
 agx_shared_layout_from_tile_size(struct agx_tile_size t)
 {
diff --git a/src/asahi/lib/agx_tilebuffer.h b/src/asahi/lib/agx_tilebuffer.h
index 2a120037a9e..4e57e265ca6 100644
--- a/src/asahi/lib/agx_tilebuffer.h
+++ b/src/asahi/lib/agx_tilebuffer.h
@@ -47,7 +47,8 @@ agx_build_tilebuffer_layout(enum pipe_format *formats, uint8_t nr_cbufs,
                             uint8_t nr_samples);
 
 bool agx_nir_lower_tilebuffer(struct nir_shader *shader,
-                              struct agx_tilebuffer_layout *tib);
+                              struct agx_tilebuffer_layout *tib,
+                              uint8_t *colormasks, bool *translucent);
 
 void agx_usc_tilebuffer(struct agx_usc_builder *b,
                         struct agx_tilebuffer_layout *tib);
@@ -57,6 +58,9 @@ uint32_t agx_tilebuffer_total_size(struct agx_tilebuffer_layout *tib);
 enum pipe_format
 agx_tilebuffer_physical_format(struct agx_tilebuffer_layout *tib, unsigned rt);
 
+bool agx_tilebuffer_supports_mask(struct agx_tilebuffer_layout *tib,
+                                  unsigned rt);
+
 #ifdef __cplusplus
 } /* extern C */
 #endif
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index de26f576dd0..3fa69f72783 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -1370,6 +1370,8 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
 
    nir_shader *nir = nir_shader_clone(NULL, so->nir);
 
+   bool force_translucent = false;
+
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       struct asahi_vs_shader_key *key = &key_->vs;
 
@@ -1393,9 +1395,25 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
          opts.format[i] = key->rt_formats[i];
 
       memcpy(opts.rt, key->blend.rt, sizeof(opts.rt));
-      NIR_PASS_V(nir, nir_lower_blend, &opts);
 
-      NIR_PASS_V(nir, agx_nir_lower_tilebuffer, &tib);
+      /* It's more efficient to use masked stores (with
+       * agx_nir_lower_tilebuffer) than to emulate colour masking with
+       * nir_lower_blend.
+       */
+      uint8_t colormasks[PIPE_MAX_COLOR_BUFS] = {0};
+
+      for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
+         if (agx_tilebuffer_supports_mask(&tib, i)) {
+            colormasks[i] = key->blend.rt[i].colormask;
+            opts.rt[i].colormask = BITFIELD_MASK(4);
+         } else {
+            colormasks[i] = BITFIELD_MASK(4);
+         }
+      }
+
+      NIR_PASS_V(nir, nir_lower_blend, &opts);
+      NIR_PASS_V(nir, agx_nir_lower_tilebuffer, &tib, colormasks,
+                 &force_translucent);
 
       if (key->sprite_coord_enable) {
          NIR_PASS_V(nir, nir_lower_texcoord_replace_late,
@@ -1415,6 +1433,16 @@ agx_compile_variant(struct agx_device *dev, struct agx_uncompiled_shader *so,
 
    agx_compile_shader_nir(nir, &base_key, debug, &binary, &compiled->info);
 
+   /* reads_tib => Translucent pass type */
+   compiled->info.reads_tib |= force_translucent;
+
+   /* Could be optimized to use non-translucent pass types with the appropriate
+    * HSR configuration, but that mechanism is not yet understood. Warn that
+    * we're leaving perf on the table when used.
+    */
+   if (force_translucent)
+      perf_debug(dev, "Translucency forced due to colour masking");
+
    if (binary.size) {
       compiled->bo = agx_bo_create(dev, binary.size,
                                    AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");

Reply via email to