Re: [Mesa-dev] [PATCH 8/8] intel/compiler: Lower SSBO and shared loads/stores in NIR
On 14/11/2018 00:23, Jason Ekstrand wrote: > We have a bunch of code to do this in the back-end compiler but it's > fairly specific to typed surface messages and the way we emit them. > This breaks it out into NIR were it's easier to do things a bit more > generally. It also means we can easily share the code between the bec4 vec4 Reviewed-by: Samuel Iglesias Gonsálvez > and FS back-ends if we wish. > --- > src/intel/Makefile.sources| 1 + > src/intel/compiler/brw_fs_nir.cpp | 381 -- > src/intel/compiler/brw_nir.c | 2 + > src/intel/compiler/brw_nir.h | 2 + > .../brw_nir_lower_mem_access_bit_sizes.c | 313 ++ > src/intel/compiler/brw_vec4_nir.cpp | 126 +- > src/intel/compiler/meson.build| 1 + > 7 files changed, 421 insertions(+), 405 deletions(-) > create mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c > > diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources > index 4da887f7ed2..5e7d32293b7 100644 > --- a/src/intel/Makefile.sources > +++ b/src/intel/Makefile.sources > @@ -85,6 +85,7 @@ COMPILER_FILES = \ > compiler/brw_nir_attribute_workarounds.c \ > compiler/brw_nir_lower_cs_intrinsics.c \ > compiler/brw_nir_lower_image_load_store.c \ > + compiler/brw_nir_lower_mem_access_bit_sizes.c \ > compiler/brw_nir_opt_peephole_ffma.c \ > compiler/brw_nir_tcs_workarounds.c \ > compiler/brw_packed_float.c \ > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index 2b36171136e..84d0c6be6c3 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -26,6 +26,7 @@ > #include "brw_fs_surface_builder.h" > #include "brw_nir.h" > #include "util/u_math.h" > +#include "util/bitscan.h" > > using namespace brw; > using namespace brw::surface_access; > @@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr > *instr) > return get_nir_src(*offset_src); > } > > -static void > -do_untyped_vector_read(const fs_builder , > - const fs_reg dest, > - const fs_reg 
surf_index, > - const fs_reg offset_reg, > - unsigned num_components) > -{ > - if (type_sz(dest.type) <= 2) { > - assert(dest.stride == 1); > - boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE; > - > - if (is_const_offset) { > - uint32_t start = offset_reg.ud & ~3; > - uint32_t end = offset_reg.ud + num_components * type_sz(dest.type); > - end = ALIGN(end, 4); > - assert (end - start <= 16); > - > - /* At this point we have 16-bit component/s that have constant > - * offset aligned to 4-bytes that can be read with untyped_reads. > - * untyped_read message requires 32-bit aligned offsets. > - */ > - unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type); > - unsigned num_components_32bit = (end - start) / 4; > - > - fs_reg read_result = > -emit_untyped_read(bld, surf_index, brw_imm_ud(start), > - 1 /* dims */, > - num_components_32bit, > - BRW_PREDICATE_NONE); > - shuffle_from_32bit_read(bld, dest, read_result, first_component, > - num_components); > - } else { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - for (unsigned i = 0; i < num_components; i++) { > -if (i == 0) { > - bld.MOV(read_offset, offset_reg); > -} else { > - bld.ADD(read_offset, offset_reg, > - brw_imm_ud(i * type_sz(dest.type))); > -} > -/* Non constant offsets are not guaranteed to be aligned 32-bits > - * so they are read using one byte_scattered_read message > - * for each component. > - */ > -fs_reg read_result = > - emit_byte_scattered_read(bld, surf_index, read_offset, > -1 /* dims */, 1, > -type_sz(dest.type) * 8 /* bit_size > */, > -BRW_PREDICATE_NONE); > -bld.MOV(offset(dest, bld, i), > -subscript (read_result, dest.type, 0)); > - } > - } > - } else if (type_sz(dest.type) == 4) { > - fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, > - num_components, > - BRW_PREDICATE_NONE); > - read_result.type = dest.type; > - for (unsigned i = 0; i < num_components; i++) > -
[Mesa-dev] [PATCH 8/8] intel/compiler: Lower SSBO and shared loads/stores in NIR
We have a bunch of code to do this in the back-end compiler but it's fairly specific to typed surface messages and the way we emit them. This breaks it out into NIR where it's easier to do things a bit more generally. It also means we can easily share the code between the vec4 and FS back-ends if we wish. --- src/intel/Makefile.sources| 1 + src/intel/compiler/brw_fs_nir.cpp | 381 -- src/intel/compiler/brw_nir.c | 2 + src/intel/compiler/brw_nir.h | 2 + .../brw_nir_lower_mem_access_bit_sizes.c | 313 ++ src/intel/compiler/brw_vec4_nir.cpp | 126 +- src/intel/compiler/meson.build| 1 + 7 files changed, 421 insertions(+), 405 deletions(-) create mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 4da887f7ed2..5e7d32293b7 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -85,6 +85,7 @@ COMPILER_FILES = \ compiler/brw_nir_attribute_workarounds.c \ compiler/brw_nir_lower_cs_intrinsics.c \ compiler/brw_nir_lower_image_load_store.c \ + compiler/brw_nir_lower_mem_access_bit_sizes.c \ compiler/brw_nir_opt_peephole_ffma.c \ compiler/brw_nir_tcs_workarounds.c \ compiler/brw_packed_float.c \ diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 2b36171136e..84d0c6be6c3 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -26,6 +26,7 @@ #include "brw_fs_surface_builder.h" #include "brw_nir.h" #include "util/u_math.h" +#include "util/bitscan.h" using namespace brw; using namespace brw::surface_access; @@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) return get_nir_src(*offset_src); } -static void -do_untyped_vector_read(const fs_builder , - const fs_reg dest, - const fs_reg surf_index, - const fs_reg offset_reg, - unsigned num_components) -{ - if (type_sz(dest.type) <= 2) { - assert(dest.stride == 1); - boolean is_const_offset = offset_reg.file == 
BRW_IMMEDIATE_VALUE; - - if (is_const_offset) { - uint32_t start = offset_reg.ud & ~3; - uint32_t end = offset_reg.ud + num_components * type_sz(dest.type); - end = ALIGN(end, 4); - assert (end - start <= 16); - - /* At this point we have 16-bit component/s that have constant - * offset aligned to 4-bytes that can be read with untyped_reads. - * untyped_read message requires 32-bit aligned offsets. - */ - unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type); - unsigned num_components_32bit = (end - start) / 4; - - fs_reg read_result = -emit_untyped_read(bld, surf_index, brw_imm_ud(start), - 1 /* dims */, - num_components_32bit, - BRW_PREDICATE_NONE); - shuffle_from_32bit_read(bld, dest, read_result, first_component, - num_components); - } else { - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); - for (unsigned i = 0; i < num_components; i++) { -if (i == 0) { - bld.MOV(read_offset, offset_reg); -} else { - bld.ADD(read_offset, offset_reg, - brw_imm_ud(i * type_sz(dest.type))); -} -/* Non constant offsets are not guaranteed to be aligned 32-bits - * so they are read using one byte_scattered_read message - * for each component. - */ -fs_reg read_result = - emit_byte_scattered_read(bld, surf_index, read_offset, -1 /* dims */, 1, -type_sz(dest.type) * 8 /* bit_size */, -BRW_PREDICATE_NONE); -bld.MOV(offset(dest, bld, i), -subscript (read_result, dest.type, 0)); - } - } - } else if (type_sz(dest.type) == 4) { - fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, - 1 /* dims */, - num_components, - BRW_PREDICATE_NONE); - read_result.type = dest.type; - for (unsigned i = 0; i < num_components; i++) - bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); - } else if (type_sz(dest.type) == 8) { - /* Reading a dvec, so we need to: - * - * 1. Multiply num_components by 2, to account for the fact that we - *need to read 64-bit components. - * 2. Shuffle the result