Re: [Mesa-dev] [PATCH 8/8] intel/compiler: Lower SSBO and shared loads/stores in NIR

2018-11-15 Thread Samuel Iglesias Gonsálvez


On 14/11/2018 00:23, Jason Ekstrand wrote:
> We have a bunch of code to do this in the back-end compiler but it's
> fairly specific to typed surface messages and the way we emit them.
> This breaks it out into NIR where it's easier to do things a bit more
> generally.  It also means we can easily share the code between the bec4

vec4

Reviewed-by: Samuel Iglesias Gonsálvez 

> and FS back-ends if we wish.
> ---
>  src/intel/Makefile.sources|   1 +
>  src/intel/compiler/brw_fs_nir.cpp | 381 --
>  src/intel/compiler/brw_nir.c  |   2 +
>  src/intel/compiler/brw_nir.h  |   2 +
>  .../brw_nir_lower_mem_access_bit_sizes.c  | 313 ++
>  src/intel/compiler/brw_vec4_nir.cpp   | 126 +-
>  src/intel/compiler/meson.build|   1 +
>  7 files changed, 421 insertions(+), 405 deletions(-)
>  create mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c
> 
> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
> index 4da887f7ed2..5e7d32293b7 100644
> --- a/src/intel/Makefile.sources
> +++ b/src/intel/Makefile.sources
> @@ -85,6 +85,7 @@ COMPILER_FILES = \
>   compiler/brw_nir_attribute_workarounds.c \
>   compiler/brw_nir_lower_cs_intrinsics.c \
>   compiler/brw_nir_lower_image_load_store.c \
> + compiler/brw_nir_lower_mem_access_bit_sizes.c \
>   compiler/brw_nir_opt_peephole_ffma.c \
>   compiler/brw_nir_tcs_workarounds.c \
>   compiler/brw_packed_float.c \
> diff --git a/src/intel/compiler/brw_fs_nir.cpp 
> b/src/intel/compiler/brw_fs_nir.cpp
> index 2b36171136e..84d0c6be6c3 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -26,6 +26,7 @@
>  #include "brw_fs_surface_builder.h"
>  #include "brw_nir.h"
>  #include "util/u_math.h"
> +#include "util/bitscan.h"
>  
>  using namespace brw;
>  using namespace brw::surface_access;
> @@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr 
> *instr)
> return get_nir_src(*offset_src);
>  }
>  
> -static void
> -do_untyped_vector_read(const fs_builder &bld,
> -   const fs_reg dest,
> -   const fs_reg surf_index,
> -   const fs_reg offset_reg,
> -   unsigned num_components)
> -{
> -   if (type_sz(dest.type) <= 2) {
> -  assert(dest.stride == 1);
> -  boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
> -
> -  if (is_const_offset) {
> - uint32_t start = offset_reg.ud & ~3;
> - uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
> - end = ALIGN(end, 4);
> - assert (end - start <= 16);
> -
> - /* At this point we have 16-bit component/s that have constant
> -  * offset aligned to 4-bytes that can be read with untyped_reads.
> -  * untyped_read message requires 32-bit aligned offsets.
> -  */
> - unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
> - unsigned num_components_32bit = (end - start) / 4;
> -
> - fs_reg read_result =
> -emit_untyped_read(bld, surf_index, brw_imm_ud(start),
> -  1 /* dims */,
> -  num_components_32bit,
> -  BRW_PREDICATE_NONE);
> - shuffle_from_32bit_read(bld, dest, read_result, first_component,
> - num_components);
> -  } else {
> - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> - for (unsigned i = 0; i < num_components; i++) {
> -if (i == 0) {
> -   bld.MOV(read_offset, offset_reg);
> -} else {
> -   bld.ADD(read_offset, offset_reg,
> -   brw_imm_ud(i * type_sz(dest.type)));
> -}
> -/* Non constant offsets are not guaranteed to be aligned 32-bits
> - * so they are read using one byte_scattered_read message
> - * for each component.
> - */
> -fs_reg read_result =
> -   emit_byte_scattered_read(bld, surf_index, read_offset,
> -1 /* dims */, 1,
> -type_sz(dest.type) * 8 /* bit_size 
> */,
> -BRW_PREDICATE_NONE);
> -bld.MOV(offset(dest, bld, i),
> -subscript (read_result, dest.type, 0));
> - }
> -  }
> -   } else if (type_sz(dest.type) == 4) {
> -  fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
> - 1 /* dims */,
> - num_components,
> - BRW_PREDICATE_NONE);
> -  read_result.type = dest.type;
> -  for (unsigned i = 0; i < num_components; i++)
> - 

[Mesa-dev] [PATCH 8/8] intel/compiler: Lower SSBO and shared loads/stores in NIR

2018-11-13 Thread Jason Ekstrand
We have a bunch of code to do this in the back-end compiler but it's
fairly specific to typed surface messages and the way we emit them.
This breaks it out into NIR where it's easier to do things a bit more
generally.  It also means we can easily share the code between the bec4
and FS back-ends if we wish.
---
 src/intel/Makefile.sources|   1 +
 src/intel/compiler/brw_fs_nir.cpp | 381 --
 src/intel/compiler/brw_nir.c  |   2 +
 src/intel/compiler/brw_nir.h  |   2 +
 .../brw_nir_lower_mem_access_bit_sizes.c  | 313 ++
 src/intel/compiler/brw_vec4_nir.cpp   | 126 +-
 src/intel/compiler/meson.build|   1 +
 7 files changed, 421 insertions(+), 405 deletions(-)
 create mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 4da887f7ed2..5e7d32293b7 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -85,6 +85,7 @@ COMPILER_FILES = \
compiler/brw_nir_attribute_workarounds.c \
compiler/brw_nir_lower_cs_intrinsics.c \
compiler/brw_nir_lower_image_load_store.c \
+   compiler/brw_nir_lower_mem_access_bit_sizes.c \
compiler/brw_nir_opt_peephole_ffma.c \
compiler/brw_nir_tcs_workarounds.c \
compiler/brw_packed_float.c \
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 2b36171136e..84d0c6be6c3 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -26,6 +26,7 @@
 #include "brw_fs_surface_builder.h"
 #include "brw_nir.h"
 #include "util/u_math.h"
+#include "util/bitscan.h"
 
 using namespace brw;
 using namespace brw::surface_access;
@@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr 
*instr)
return get_nir_src(*offset_src);
 }
 
-static void
-do_untyped_vector_read(const fs_builder &bld,
-   const fs_reg dest,
-   const fs_reg surf_index,
-   const fs_reg offset_reg,
-   unsigned num_components)
-{
-   if (type_sz(dest.type) <= 2) {
-  assert(dest.stride == 1);
-  boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
-
-  if (is_const_offset) {
- uint32_t start = offset_reg.ud & ~3;
- uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);
- end = ALIGN(end, 4);
- assert (end - start <= 16);
-
- /* At this point we have 16-bit component/s that have constant
-  * offset aligned to 4-bytes that can be read with untyped_reads.
-  * untyped_read message requires 32-bit aligned offsets.
-  */
- unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);
- unsigned num_components_32bit = (end - start) / 4;
-
- fs_reg read_result =
-emit_untyped_read(bld, surf_index, brw_imm_ud(start),
-  1 /* dims */,
-  num_components_32bit,
-  BRW_PREDICATE_NONE);
- shuffle_from_32bit_read(bld, dest, read_result, first_component,
- num_components);
-  } else {
- fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
- for (unsigned i = 0; i < num_components; i++) {
-if (i == 0) {
-   bld.MOV(read_offset, offset_reg);
-} else {
-   bld.ADD(read_offset, offset_reg,
-   brw_imm_ud(i * type_sz(dest.type)));
-}
-/* Non constant offsets are not guaranteed to be aligned 32-bits
- * so they are read using one byte_scattered_read message
- * for each component.
- */
-fs_reg read_result =
-   emit_byte_scattered_read(bld, surf_index, read_offset,
-1 /* dims */, 1,
-type_sz(dest.type) * 8 /* bit_size */,
-BRW_PREDICATE_NONE);
-bld.MOV(offset(dest, bld, i),
-subscript (read_result, dest.type, 0));
- }
-  }
-   } else if (type_sz(dest.type) == 4) {
-  fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
- 1 /* dims */,
- num_components,
- BRW_PREDICATE_NONE);
-  read_result.type = dest.type;
-  for (unsigned i = 0; i < num_components; i++)
- bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
-   } else if (type_sz(dest.type) == 8) {
-  /* Reading a dvec, so we need to:
-   *
-   * 1. Multiply num_components by 2, to account for the fact that we
-   *need to read 64-bit components.
-   * 2. Shuffle the result