This pass moves to NIR some offset calculations that are currently
implemented in the backend compiler, so that NIR can potentially
optimize them.

For now, only coordinate byte-offset calculations for imageStore and image
atomic operations are implemented.
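
That is, the pass computes, in bytes,

   offset = x * bpp + y * y_stride + z * z_stride

(dropping the y/z terms for 1D/2D images) and writes it to the unused
4th component of the image intrinsic's vec4 coordinate source. Image
atomics consume a dword offset instead, so for those the value is
additionally shifted right by 2.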

---
 src/freedreno/Makefile.sources               |   1 +
 src/freedreno/ir3/ir3_nir.h                  |   1 +
 src/freedreno/ir3/ir3_nir_lower_sampler_io.c | 349 +++++++++++++++++++
 3 files changed, 351 insertions(+)
 create mode 100644 src/freedreno/ir3/ir3_nir_lower_sampler_io.c

diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 7fea9de39ef..fd4f7f294cd 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -31,6 +31,7 @@ ir3_SOURCES := \
 	ir3/ir3_legalize.c \
 	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
+	ir3/ir3_nir_lower_sampler_io.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_print.c \
 	ir3/ir3_ra.c \
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 74201d34160..52809ba099e 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
 
 bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
 bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+bool ir3_nir_lower_sampler_io(nir_shader *shader);
 
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
 bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
diff --git a/src/freedreno/ir3/ir3_nir_lower_sampler_io.c b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
new file mode 100644
index 00000000000..e2910d8906d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright © 2018 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The goal of this pass is to move to NIR some offset calculations for
+ * different I/O that are currently implemented in the backend compiler,
+ * so that NIR can potentially optimize them.
+ *
+ * Currently, only offset calculations for image store and image
+ * atomic operations are implemented.
+ */
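+
+/* As an illustrative sketch (the SSA names here are made up), for a 2D
+ * image store the vec4 coordinate source
+ *
+ *    vec4 ssa_5 = vec4 ssa_x, ssa_y, undef, undef
+ *
+ * is rewritten so that its unused 4th component carries the byte offset:
+ *
+ *    ssa_off = ssa_x * bpp + ssa_y * y_stride
+ *    vec4 ssa_5 = vec4 ssa_x, ssa_y, undef, ssa_off
+ */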
+
+
+/* This flag enables/disables a code-path where the bytes-per-pixel of
+ * an image is obtained directly from the format, which is known at
+ * shader compile time; as opposed to using the image_dims[0] constant,
+ * which is available only at shader run time.
+ *
+ * Inlining the bytes-per-pixel here as an immediate has the advantage
+ * that it gets converted to a single (SHL) instruction (because all
+ * possible values are powers of two); whereas loading it as a uniform
+ * and emitting an IMUL will cause the backend to expand it to quite a
+ * few instructions (see ir3_compiler_nir for imul), thus ultimately
+ * hurting instruction count.
+ */
+#define INLINE_BPP 1
+
+
+static bool
+intrinsic_is_image_atomic(unsigned intrinsic)
+{
+	switch (intrinsic) {
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static bool
+intrinsic_is_image_store_or_atomic(unsigned intrinsic)
+{
+	if (intrinsic == nir_intrinsic_image_deref_store)
+		return true;
+	else
+		return intrinsic_is_image_atomic(intrinsic);
+}
+
+/*
+ * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
+ * out at some point.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned coords;
+
+	switch (glsl_get_sampler_dim(type)) {
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		coords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		coords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		coords = 3;
+		break;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+
+	if (glsl_sampler_type_is_array(type)) {
+		/* note: unlike tex_info(), adjust # of coords to include array idx: */
+		coords++;
+	}
+
+	return coords;
+}
+
+#if INLINE_BPP
+/* Returns the bytes-per-pixel for the GL formats corresponding to all
+ * supported image formats.
+ */
+static unsigned
+bytes_per_pixel_for_gl_format(GLuint format)
+{
+	switch (format) {
+	case GL_R8I:
+	case GL_R8UI:
+	case GL_R8:
+	case GL_R8_SNORM:
+		return 1;
+
+	case GL_R16F:
+	case GL_R16I:
+	case GL_R16UI:
+	case GL_R16:
+	case GL_R16_SNORM:
+	case GL_RG8I:
+	case GL_RG8UI:
+	case GL_RG8:
+	case GL_RG8_SNORM:
+		return 2;
+
+	case GL_R32F:
+	case GL_R32I:
+	case GL_R32UI:
+	case GL_RG16F:
+	case GL_RG16I:
+	case GL_RG16UI:
+	case GL_RG16:
+	case GL_RG16_SNORM:
+	case GL_RGBA8I:
+	case GL_RGBA8UI:
+	case GL_RGBA8:
+	case GL_RGBA8_SNORM:
+	case GL_RGB10_A2UI:
+	case GL_RGB10_A2:
+	case GL_R11F_G11F_B10F:
+		return 4;
+
+	case GL_RG32F:
+	case GL_RG32I:
+	case GL_RG32UI:
+	case GL_RGBA16F:
+	case GL_RGBA16I:
+	case GL_RGBA16UI:
+	case GL_RGBA16:
+	case GL_RGBA16_SNORM:
+		return 8;
+
+	case GL_RGBA32F:
+	case GL_RGBA32I:
+	case GL_RGBA32UI:
+		return 16;
+
+	default:
+		debug_assert(!"Unhandled GL format");
+	}
+
+	return 0;
+}
+#endif /* INLINE_BPP */
+
+/* Emits a load_image_stride intrinsic that loads the image_dims[] entry
+ * given by 'dimension' (bytes-per-pixel, Y stride or Z stride) for the
+ * image at 'image_index'.
+ */
+static nir_ssa_def *
+insert_load_image_stride(nir_builder *b, unsigned image_index,
+			 unsigned dimension)
+{
+	nir_ssa_def *zero = nir_imm_int(b, 0);
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_image_stride);
+	load->num_components = 1;
+	load->const_index[0] = image_index;
+	load->const_index[1] = dimension;
+	load->src[0] = nir_src_for_ssa(zero);
+	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+
+	nir_builder_instr_insert(b, &load->instr);
+
+	return &load->dest.ssa;
+}
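+
+/* Lowers the offset calculation for a single image store/atomic. The
+ * intrinsic's coordinate source (src[1]) is expected to be produced by
+ * a vec4 ALU instruction; the computed offset is written to its unused
+ * 4th component.
+ */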
+static void
+lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
+				       const nir_variable *var, nir_builder *b,
+				       void *mem_ctx)
+{
+	/* Find the instruction that defines the coord source of the image
+	 * store/atomic intrinsic. It must be a "vec4" ALU instruction.
+	 */
+	debug_assert(intrinsic->src[1].is_ssa);
+	nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
+
+	nir_instr *offset_parent_instr = offset_src_def->parent_instr;
+	debug_assert(offset_parent_instr->type == nir_instr_type_alu);
+
+	nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
+	debug_assert(vec4_instr->op == nir_op_vec4);
+
+	unsigned coords = get_image_coords(var);
+
+	b->cursor = nir_before_instr(&vec4_instr->instr);
+
+	/* These are actually offsets into the image_dims register file
+	 * (for a given image).
+	 */
+	enum {
+		BYTES_PER_PIXEL = 0,
+		Y_STRIDE = 1,
+		Z_STRIDE = 2
+	};
+
+	/* x_offset = coords.x * bytes_per_pixel */
+	nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
+#if INLINE_BPP
+	unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
+	nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
+#else
+	nir_ssa_def *bpp =
+		insert_load_image_stride(b, var->data.driver_location, BYTES_PER_PIXEL);
+	nir_ssa_def *offset = nir_imul(b, x_coord, bpp);
+#endif
+	nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
+	debug_assert(imul);
+	imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
+	debug_assert(offset);
+
+	/* For the Y and Z dimensions, we emit a temporary load_image_stride
+	 * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(),
+	 * which will just emit a uniform with the right value from
+	 * image_dims[].
+	 */
+
+	if (coords > 1) {
+		nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
+		nir_ssa_def *y_stride =
+			insert_load_image_stride(b, var->data.driver_location, Y_STRIDE);
+
+		/* y_offset = coords.y * y_stride + x_offset */
+		offset = nir_imad(b, y_stride, y_coord, offset);
+		debug_assert(offset);
+		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+		debug_assert(imad);
+		imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
+	}
+
+	if (coords > 2) {
+		nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
+		nir_ssa_def *z_stride =
+			insert_load_image_stride(b, var->data.driver_location, Z_STRIDE);
+
+		/* z_offset = coords.z * z_stride + y_offset */
+		offset = nir_imad(b, z_stride, z_coord, offset);
+		debug_assert(offset);
+		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+		debug_assert(imad);
+		imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
+	}
+
+	if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
+		/* Some cases, like atomics, seem to use dword offsets instead
+		 * of byte offsets... the blob just puts an extra shr.b in
+		 * there in those cases:
+		 */
+		nir_ssa_def *two = nir_imm_int(b, 2);
+		offset = nir_ushr(b, offset, two);
+	}
+
+	/* Finally, store the calculated offset in the 4th component of the
+	 * vec4 instruction. We use the 4th component because it is the one
+	 * we know for sure is not used.
+	 */
+	nir_instr_rewrite_src(&vec4_instr->instr,
+			      &vec4_instr->src[3].src,
+			      nir_src_for_ssa(offset));
+}
+
+static bool
+lower_sampler_io_block(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+	bool progress = false;
+
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+		if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
+			continue;
+
+		const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+		lower_offset_for_image_store_or_atomic(intr, var, b, mem_ctx);
+
+		progress = true;
+	}
+
+	return progress;
+}
+
+static bool
+lower_sampler_io_func(nir_function_impl *impl)
+{
+	void *mem_ctx = ralloc_parent(impl);
+	nir_builder b;
+	nir_builder_init(&b, impl);
+
+	bool progress = false;
+	nir_foreach_block_safe(block, impl) {
+		progress |= lower_sampler_io_block(block, &b, mem_ctx);
+	}
+
+	if (progress) {
+		nir_metadata_preserve(impl, nir_metadata_block_index |
+					    nir_metadata_dominance);
+	}
+
+	return progress;
+}
+
+bool
+ir3_nir_lower_sampler_io(nir_shader *shader)
+{
+	bool progress = false;
+
+	nir_foreach_function(function, shader) {
+		if (function->impl)
+			progress |= lower_sampler_io_func(function->impl);
+	}
+
+	return progress;
+}
-- 
2.20.1