Re: [Mesa-dev] [PATCH 1/2] i965/fs: Rewrite assign_constant_locations

2017-12-06 Thread Chema Casanova
I've tested this patch against the VK-CTS push constant 16-bit tests,
and enabled storagePushConstant16 at VK_KHR_16bit_storage. All test pass
without any extra modification.

dEQP-VK.spirv_assembly.instruction.compute.16bit_storage.push_constant.*
dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.push_constant.*

All test pass. I have pending in my TODO to build a test mixing push
constant values with different bitsizes.

Tested-by: Jose Maria Casanova Crespo 


On 04/12/17 02:50, Jason Ekstrand wrote:
> This rewires the logic for assigning uniform locations to work in terms
> of "complex alignments".  The basic idea is that, as we walk the list of
> instructions, we keep track of the alignment and continuity requirements
> of each slot and assert that the alignments all match up.  We then use
> those alignments in the compaction stage to ensure that everything gets
> placed at a properly aligned register.  The old mechanism handled
> alignments by special-casing each of the bit sizes and placing 64-bit
> values first followed by 32-bit values.
> 
> The old scheme had the advantage of never leaving a hole since all the
> 64-bit values could be tightly packed and so could the 32-bit values.
> However, the new scheme has no type size special cases so it handles not
> only 32 and 64-bit types but should gracefully extend to 16 and 8-bit
> types as the need arises.
> 
> Cc: Kenneth Graunke 
> Cc: Jose Maria Casanova Crespo 
> ---
>  src/intel/compiler/brw_fs.cpp | 307 
> --
>  1 file changed, 174 insertions(+), 133 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 6772c0d..ffd8e12 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -1874,62 +1874,6 @@ fs_visitor::compact_virtual_grfs()
> return progress;
>  }
>  
> -static void
> -set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
> -   unsigned *max_chunk_bitsize,
> -   bool contiguous, unsigned bitsize,
> -   const unsigned target_bitsize,
> -   int *push_constant_loc, int *pull_constant_loc,
> -   unsigned *num_push_constants,
> -   unsigned *num_pull_constants,
> -   const unsigned max_push_components,
> -   const unsigned max_chunk_size,
> -   bool allow_pull_constants,
> -   struct brw_stage_prog_data *stage_prog_data)
> -{
> -   /* This is the first live uniform in the chunk */
> -   if (*chunk_start < 0)
> -  *chunk_start = uniform;
> -
> -   /* Keep track of the maximum bit size access in contiguous uniforms */
> -   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
> -
> -   /* If this element does not need to be contiguous with the next, we
> -* split at this point and everything between chunk_start and u forms a
> -* single chunk.
> -*/
> -   if (!contiguous) {
> -  /* If bitsize doesn't match the target one, skip it */
> -  if (*max_chunk_bitsize != target_bitsize) {
> - /* FIXME: right now we only support 32 and 64-bit accesses */
> - assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
> - *max_chunk_bitsize = 0;
> - *chunk_start = -1;
> - return;
> -  }
> -
> -  unsigned chunk_size = uniform - *chunk_start + 1;
> -
> -  /* Decide whether we should push or pull this parameter.  In the
> -   * Vulkan driver, push constants are explicitly exposed via the API
> -   * so we push everything.  In GL, we only push small arrays.
> -   */
> -  if (!allow_pull_constants ||
> -  (*num_push_constants + chunk_size <= max_push_components &&
> -   chunk_size <= max_chunk_size)) {
> - assert(*num_push_constants + chunk_size <= max_push_components);
> - for (unsigned j = *chunk_start; j <= uniform; j++)
> -push_constant_loc[j] = (*num_push_constants)++;
> -  } else {
> - for (unsigned j = *chunk_start; j <= uniform; j++)
> -pull_constant_loc[j] = (*num_pull_constants)++;
> -  }
> -
> -  *max_chunk_bitsize = 0;
> -  *chunk_start = -1;
> -   }
> -}
> -
>  static int
>  get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
>  {
> @@ -1945,6 +1889,98 @@ get_subgroup_id_param_index(const brw_stage_prog_data 
> *prog_data)
>  }
>  
>  /**
> + * Struct for handling complex alignments.
> + *
> + * A complex alignment is stored as multiplier and an offset.  A value is
> + * considered to be aligned if it is congruent to the offset modulo the
> + * multiplier.
> + */
> +struct cplx_align {
> +   unsigned mul:4;
> +   unsigned offset:4;
> +};
> +
> +#define CPLX_ALIGN_MAX_MUL 8
> +
> +static void
> 

[Mesa-dev] [PATCH 1/2] i965/fs: Rewrite assign_constant_locations

2017-12-03 Thread Jason Ekstrand
This rewires the logic for assigning uniform locations to work in terms
of "complex alignments".  The basic idea is that, as we walk the list of
instructions, we keep track of the alignment and continuity requirements
of each slot and assert that the alignments all match up.  We then use
those alignments in the compaction stage to ensure that everything gets
placed at a properly aligned register.  The old mechanism handled
alignments by special-casing each of the bit sizes and placing 64-bit
values first followed by 32-bit values.

The old scheme had the advantage of never leaving a hole since all the
64-bit values could be tightly packed and so could the 32-bit values.
However, the new scheme has no type size special cases so it handles not
only 32 and 64-bit types but should gracefully extend to 16 and 8-bit
types as the need arises.

Cc: Kenneth Graunke 
Cc: Jose Maria Casanova Crespo 
---
 src/intel/compiler/brw_fs.cpp | 307 --
 1 file changed, 174 insertions(+), 133 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 6772c0d..ffd8e12 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1874,62 +1874,6 @@ fs_visitor::compact_virtual_grfs()
return progress;
 }
 
-static void
-set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
-   unsigned *max_chunk_bitsize,
-   bool contiguous, unsigned bitsize,
-   const unsigned target_bitsize,
-   int *push_constant_loc, int *pull_constant_loc,
-   unsigned *num_push_constants,
-   unsigned *num_pull_constants,
-   const unsigned max_push_components,
-   const unsigned max_chunk_size,
-   bool allow_pull_constants,
-   struct brw_stage_prog_data *stage_prog_data)
-{
-   /* This is the first live uniform in the chunk */
-   if (*chunk_start < 0)
-  *chunk_start = uniform;
-
-   /* Keep track of the maximum bit size access in contiguous uniforms */
-   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
-
-   /* If this element does not need to be contiguous with the next, we
-* split at this point and everything between chunk_start and u forms a
-* single chunk.
-*/
-   if (!contiguous) {
-  /* If bitsize doesn't match the target one, skip it */
-  if (*max_chunk_bitsize != target_bitsize) {
- /* FIXME: right now we only support 32 and 64-bit accesses */
- assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
- *max_chunk_bitsize = 0;
- *chunk_start = -1;
- return;
-  }
-
-  unsigned chunk_size = uniform - *chunk_start + 1;
-
-  /* Decide whether we should push or pull this parameter.  In the
-   * Vulkan driver, push constants are explicitly exposed via the API
-   * so we push everything.  In GL, we only push small arrays.
-   */
-  if (!allow_pull_constants ||
-  (*num_push_constants + chunk_size <= max_push_components &&
-   chunk_size <= max_chunk_size)) {
- assert(*num_push_constants + chunk_size <= max_push_components);
- for (unsigned j = *chunk_start; j <= uniform; j++)
-push_constant_loc[j] = (*num_push_constants)++;
-  } else {
- for (unsigned j = *chunk_start; j <= uniform; j++)
-pull_constant_loc[j] = (*num_pull_constants)++;
-  }
-
-  *max_chunk_bitsize = 0;
-  *chunk_start = -1;
-   }
-}
-
 static int
 get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
 {
@@ -1945,6 +1889,98 @@ get_subgroup_id_param_index(const brw_stage_prog_data 
*prog_data)
 }
 
 /**
+ * Struct for handling complex alignments.
+ *
+ * A complex alignment is stored as multiplier and an offset.  A value is
+ * considered to be aligned if it is congruent to the offset modulo the
+ * multiplier.
+ */
+struct cplx_align {
+   unsigned mul:4;
+   unsigned offset:4;
+};
+
+#define CPLX_ALIGN_MAX_MUL 8
+
+static void
+cplx_align_assert_sane(struct cplx_align a)
+{
+   assert(a.mul > 0 && util_is_power_of_two(a.mul));
+   assert(a.offset < a.mul);
+}
+
+/**
+ * Combines two alignments to produce a least multiple of sorts.
+ *
+ * The returned alignment is the smallest (in terms of multiplier) such that
+ * anything aligned to both a and b will be aligned to the new alignment.
+ * This function will assert-fail if a and b are not compatible, i.e. if the
+ * offset parameters are such that no common alignment is possible.
+ */
+static struct cplx_align
+cplx_align_combine(struct cplx_align a, struct cplx_align b)
+{
+   cplx_align_assert_sane(a);
+   cplx_align_assert_sane(b);
+
+   /* Assert that the alignments agree. */
+   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
+
+