On Fri, Feb 23, 2018 at 1:26 AM, Jose Maria Casanova Crespo <
jmcasan...@igalia.com> wrote:

> 16-bit load_ubo/ssbo operations that call do_untyped_read_vector doesn't
> guarantee that offsets are multiple of 4-bytes as required by untyped_read
> message. This happens for example on 16-bit scalar arrays and in the case
> of f16vec3 when then VK_KHR_relaxed_block_layoud is enabled.
>
> Vectors reads when we have non-constant offsets are implemented with
> multiple byte_scattered_read messages that not require 32-bit aligned
> offsets.
> The same applies for constant offsets not aligned to 32-bits.
>
> Untyped_read_surface is used message when there is a constant offset
> 32-bit aligned and we have more than 1 component to read.
> ---
>  src/intel/compiler/brw_fs_nir.cpp | 60 ++++++++++++++++++++++++++++--
> ---------
>  1 file changed, 44 insertions(+), 16 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index 8efec34cc9..45b8e8b637 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2305,27 +2305,55 @@ do_untyped_vector_read(const fs_builder &bld,
>     if (type_sz(dest.type) <= 2) {
>        assert(dest.stride == 1);
>
> -      if (num_components > 1) {
> -         /* Pairs of 16-bit components can be read with untyped read, for
> 16-bit
> -          * vec3 4th component is ignored.
> +      unsigned pending_components = num_components;
> +      unsigned first_component = 0;
> +      boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;
> +      fs_reg read_offset;
> +      if (is_const_offset)
> +         read_offset = offset_reg;
> +      else {
> +         read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +         bld.MOV(read_offset, offset_reg);
> +      }
> +      while (pending_components > 0 &&
> +             (pending_components == 1 ||
> +              !is_const_offset ||
> +              (offset_reg.ud + first_component * 2) % 4)) {
>

I think this would be easier to read as

while (pending_components > 0) {
   if (!is_const_offset || (offset_reg.ud + first_component *
type_sz(dst.type)) % 4) {
      /* Do a single-component load with a byte message */
      first_compoent++;
      pending_components--;
   } else {
      /* Do a load with an untyped read */
      pending_components = 0;
      break;
   }
}

Or, alternatively,

if (is_const_offset) {
   uint32_t start = offset_reg.ud & 3;
   uint32_t end = offset_reg + num_components * type_sz(dst.type);
   end = ALIGN(end, 4);
   assert(end - start <= 16);
   /* Do a load with an untyped read and then throw away unneeded data at
the beginning and the end */
} else {
   /* Do the load using a loop and byte scattered reads */
}

This second one would require a small extension to
shuffle_32bit_load_result_to_16bit_data to handle throwing away data at the
start.


> +         /* Non constant offsets, 16-bit scalars and constant offsets not
> +          * aligned 32-bits are read using one byte_scattered_read message
> +          * for eache  component untyped_read requires 32-bit aligned
> offsets.
>            */
>           fs_reg read_result =
> -            emit_untyped_read(bld, surf_index, offset_reg,
> -                              1 /* dims */, DIV_ROUND_UP(num_components,
> 2),
> -                              BRW_PREDICATE_NONE);
> -         shuffle_32bit_load_result_to_16bit_data(bld,
> -               retype(dest, BRW_REGISTER_TYPE_W),
> -               retype(read_result, BRW_REGISTER_TYPE_D),
> -               num_components);
> -      } else {
> -         assert(num_components == 1);
> -         /* scalar 16-bit are read using one byte_scattered_read message
> */
> -         fs_reg read_result =
> -            emit_byte_scattered_read(bld, surf_index, offset_reg,
> +            emit_byte_scattered_read(bld, surf_index, read_offset,
>                                       1 /* dims */, 1,
>                                       type_sz(dest.type) * 8 /* bit_size
> */,
>                                       BRW_PREDICATE_NONE);
> -         bld.MOV(dest, subscript(read_result, dest.type, 0));
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +               retype(offset(dest, bld, first_component),
> BRW_REGISTER_TYPE_W),
> +               retype(read_result, BRW_REGISTER_TYPE_D),
> +               1);
> +         pending_components--;
> +         first_component ++;
> +         if (is_const_offset)
> +            read_offset.ud += 2;
> +         else
> +            bld.ADD(read_offset, offset_reg, brw_imm_ud(2 *
> first_component));
> +      }
> +      assert(pending_components != 1);
> +      if (pending_components > 1) {
> +         assert (is_const_offset &&
> +                 (offset_reg.ud + first_component * 2) % 4 == 0);
> +         /* At this point we have multiple 16-bit components that have
> constant
> +          * offset multiple of 4-bytes that can be read with
> untyped_reads.
> +          */
> +         fs_reg read_result =
> +            emit_untyped_read(bld, surf_index, read_offset,
> +                              1 /* dims */, DIV_ROUND_UP(pending_components,
> 2),
> +                              BRW_PREDICATE_NONE);
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +               retype(offset(dest,bld,first_component),
> BRW_REGISTER_TYPE_W),
> +               retype(read_result, BRW_REGISTER_TYPE_D),
> +               pending_components);
>        }
>     } else if (type_sz(dest.type) == 4) {
>        fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
> --
> 2.14.3
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to