This pass moves to NIR some offset calculations that are currently
implemented in the backend compiler, so that NIR can potentially
optimize them.

For now, only coordinate byte-offset calculations for imageStore and image
atomic operations are implemented.
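
That is, the pass computes, in bytes,

   offset = x * bpp + y * y_stride + z * z_stride

(dropping the y/z terms for 1D/2D images) and writes it to the unused
4th component of the image intrinsic's vec4 coordinate source. Image
atomics consume a dword offset instead, so for those the value is
additionally shifted right by 2.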

---
 src/freedreno/Makefile.sources               |   1 +
 src/freedreno/ir3/ir3_nir.h                  |   1 +
 src/freedreno/ir3/ir3_nir_lower_sampler_io.c | 349 +++++++++++++++++++
 3 files changed, 351 insertions(+)
 create mode 100644 src/freedreno/ir3/ir3_nir_lower_sampler_io.c

diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 7fea9de39ef..fd4f7f294cd 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -31,6 +31,7 @@ ir3_SOURCES := \
 	ir3/ir3_legalize.c \
 	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
+	ir3/ir3_nir_lower_sampler_io.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_print.c \
 	ir3/ir3_ra.c \
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 74201d34160..52809ba099e 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
 
 bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
 bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+bool ir3_nir_lower_sampler_io(nir_shader *shader);
 
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
 bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
diff --git a/src/freedreno/ir3/ir3_nir_lower_sampler_io.c b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
new file mode 100644
index 00000000000..e2910d8906d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright © 2018 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The goal of this pass is to move to NIR some offset calculations for
+ * different I/O that are currently implemented in the backend compiler,
+ * so that NIR can potentially optimize them.
+ *
+ * Currently, only offset calculations for image store and image
+ * atomic operations are implemented.
+ */
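+
+/* As an illustrative sketch (the SSA names here are made up), for a 2D
+ * image store the vec4 coordinate source
+ *
+ *    vec4 ssa_5 = vec4 ssa_x, ssa_y, undef, undef
+ *
+ * is rewritten so that its unused 4th component carries the byte offset:
+ *
+ *    ssa_off = ssa_x * bpp + ssa_y * y_stride
+ *    vec4 ssa_5 = vec4 ssa_x, ssa_y, undef, ssa_off
+ */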
+
+
+/* This flag enables/disables a code-path where the bytes-per-pixel of
+ * an image is obtained directly from the format, which is known at
+ * shader compile time; as opposed to using the image_dims[0] constant,
+ * which is available only at shader run time.
+ *
+ * Inlining the bytes-per-pixel here as an immediate has the advantage
+ * that it gets converted to a single (SHL) instruction (because all
+ * possible values are powers of two); whereas loading it as a uniform
+ * and emitting an IMUL will cause the backend to expand it to quite a
+ * few instructions (see ir3_compiler_nir for imul), thus ultimately
+ * hurting instruction count.
+ */
+#define INLINE_BPP 1
+
+
+static bool
+intrinsic_is_image_atomic(unsigned intrinsic)
+{
+	switch (intrinsic) {
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static bool
+intrinsic_is_image_store_or_atomic(unsigned intrinsic)
+{
+	if (intrinsic == nir_intrinsic_image_deref_store)
+		return true;
+	else
+		return intrinsic_is_image_atomic(intrinsic);
+}
+
+/*
+ * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
+ * out at some point.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned coords;
+
+	switch (glsl_get_sampler_dim(type)) {
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		coords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		coords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		coords = 3;
+		break;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+
+	if (glsl_sampler_type_is_array(type)) {
+		/* note: unlike tex_info(), adjust # of coords to include array idx: */
+		coords++;
+	}
+
+	return coords;
+}
+
+#if INLINE_BPP
+/* Returns the bytes-per-pixel for the GL formats corresponding to all
+ * supported image formats.
+ */
+static unsigned
+bytes_per_pixel_for_gl_format(GLuint format)
+{
+	switch (format) {
+	case GL_R8I:
+	case GL_R8UI:
+	case GL_R8:
+	case GL_R8_SNORM:
+		return 1;
+
+	case GL_R16F:
+	case GL_R16I:
+	case GL_R16UI:
+	case GL_R16:
+	case GL_R16_SNORM:
+	case GL_RG8I:
+	case GL_RG8UI:
+	case GL_RG8:
+	case GL_RG8_SNORM:
+		return 2;
+
+	case GL_R32F:
+	case GL_R32I:
+	case GL_R32UI:
+	case GL_RG16F:
+	case GL_RG16I:
+	case GL_RG16UI:
+	case GL_RG16:
+	case GL_RG16_SNORM:
+	case GL_RGBA8I:
+	case GL_RGBA8UI:
+	case GL_RGBA8:
+	case GL_RGBA8_SNORM:
+	case GL_RGB10_A2UI:
+	case GL_RGB10_A2:
+	case GL_R11F_G11F_B10F:
+		return 4;
+
+	case GL_RG32F:
+	case GL_RG32I:
+	case GL_RG32UI:
+	case GL_RGBA16F:
+	case GL_RGBA16I:
+	case GL_RGBA16UI:
+	case GL_RGBA16:
+	case GL_RGBA16_SNORM:
+		return 8;
+
+	case GL_RGBA32F:
+	case GL_RGBA32I:
+	case GL_RGBA32UI:
+		return 16;
+
+	default:
+		debug_assert(!"Unhandled GL format");
+	}
+
+	return 0;
+}
+#endif /* INLINE_BPP */
+
+/* Emits a load_image_stride intrinsic that loads the image_dims[] entry
+ * given by 'dimension' (bytes-per-pixel, Y stride or Z stride) for the
+ * image at 'image_index'.
+ */
+static nir_ssa_def *
+insert_load_image_stride(nir_builder *b, unsigned image_index,
+			 unsigned dimension)
+{
+	nir_ssa_def *zero = nir_imm_int(b, 0);
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_image_stride);
+	load->num_components = 1;
+	load->const_index[0] = image_index;
+	load->const_index[1] = dimension;
+	load->src[0] = nir_src_for_ssa(zero);
+	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+
+	nir_builder_instr_insert(b, &load->instr);
+
+	return &load->dest.ssa;
+}
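+
+/* Lowers the offset calculation for a single image store/atomic. The
+ * intrinsic's coordinate source (src[1]) is expected to be produced by
+ * a vec4 ALU instruction; the computed offset is written to its unused
+ * 4th component.
+ */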
+static void
+lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
+				       const nir_variable *var, nir_builder *b,
+				       void *mem_ctx)
+{
+	/* Find the instruction that defines the coord source of the image
+	 * store/atomic intrinsic. It must be a "vec4" ALU instruction.
+	 */
+	debug_assert(intrinsic->src[1].is_ssa);
+	nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
+
+	nir_instr *offset_parent_instr = offset_src_def->parent_instr;
+	debug_assert(offset_parent_instr->type == nir_instr_type_alu);
+
+	nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
+	debug_assert(vec4_instr->op == nir_op_vec4);
+
+	unsigned coords = get_image_coords(var);
+
+	b->cursor = nir_before_instr(&vec4_instr->instr);
+
+	/* These are actually offsets into the image_dims register file
+	 * (for a given image).
+	 */
+	enum {
+		BYTES_PER_PIXEL = 0,
+		Y_STRIDE = 1,
+		Z_STRIDE = 2
+	};
+
+	/* x_offset = coords.x * bytes_per_pixel */
+	nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
+#if INLINE_BPP
+	unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
+	nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
+#else
+	nir_ssa_def *bpp =
+		insert_load_image_stride(b, var->data.driver_location, BYTES_PER_PIXEL);
+	nir_ssa_def *offset = nir_imul(b, x_coord, bpp);
+#endif
+	nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
+	debug_assert(imul);
+	imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
+	debug_assert(offset);
+
+	/* For the Y and Z dimensions, we emit a temporary load_image_stride
+	 * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(),
+	 * which will just emit a uniform with the right value from
+	 * image_dims[].
+	 */
+
+	if (coords > 1) {
+		nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
+		nir_ssa_def *y_stride =
+			insert_load_image_stride(b, var->data.driver_location, Y_STRIDE);
+
+		/* y_offset = coords.y * y_stride + x_offset */
+		offset = nir_imad(b, y_stride, y_coord, offset);
+		debug_assert(offset);
+		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+		debug_assert(imad);
+		imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
+	}
+
+	if (coords > 2) {
+		nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
+		nir_ssa_def *z_stride =
+			insert_load_image_stride(b, var->data.driver_location, Z_STRIDE);
+
+		/* z_offset = coords.z * z_stride + y_offset */
+		offset = nir_imad(b, z_stride, z_coord, offset);
+		debug_assert(offset);
+		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+		debug_assert(imad);
+		imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
+	}
+
+	if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
+		/* Some cases, like atomics, seem to use dword offsets instead
+		 * of byte offsets... the blob just puts an extra shr.b in
+		 * there in those cases:
+		 */
+		nir_ssa_def *two = nir_imm_int(b, 2);
+		offset = nir_ushr(b, offset, two);
+	}
+
+	/* Finally, store the calculated offset in the 4th component of the
+	 * vec4 instruction. We use the 4th component because it is the one
+	 * we know for sure is not used.
+	 */
+	nir_instr_rewrite_src(&vec4_instr->instr,
+			      &vec4_instr->src[3].src,
+			      nir_src_for_ssa(offset));
+}
+
+static bool
+lower_sampler_io_block(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+	bool progress = false;
+
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+		if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
+			continue;
+
+		const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+		lower_offset_for_image_store_or_atomic(intr, var, b, mem_ctx);
+
+		progress = true;
+	}
+
+	return progress;
+}
+
+static bool
+lower_sampler_io_func(nir_function_impl *impl)
+{
+	void *mem_ctx = ralloc_parent(impl);
+	nir_builder b;
+	nir_builder_init(&b, impl);
+
+	bool progress = false;
+	nir_foreach_block_safe(block, impl) {
+		progress |= lower_sampler_io_block(block, &b, mem_ctx);
+	}
+
+	if (progress) {
+		nir_metadata_preserve(impl, nir_metadata_block_index |
+					    nir_metadata_dominance);
+	}
+
+	return progress;
+}
+
+bool
+ir3_nir_lower_sampler_io(nir_shader *shader)
+{
+	bool progress = false;
+
+	nir_foreach_function(function, shader) {
+		if (function->impl)
+			progress |= lower_sampler_io_func(function->impl);
+	}
+
+	return progress;
+}
-- 
2.20.1