[Mesa-dev] [PATCH] draw: improve vertex fetch (v2)

2016-10-14 Thread sroland
From: Roland Scheidegger 

The per-element fetch contains quite a few calculations which are constant;
these can be moved outside both the per-element loop and the main shader
loop (llvm can mostly figure out on its own that they are constant, but
this can come at a significant compile-time cost).
Similarly, it looks easier to swap the fetch loops (outer loop per attrib,
inner loop filling in the per-vertex elements) - this way the aos->soa
conversion can also be done per attrib rather than only at the end, though
again this doesn't make much of a difference in the generated code. (It
would also make it possible to vectorize the calculations leading up to
the fetches.)
There is also a minimal change which simplifies the overflow math slightly.
All in all, the generated code looks slightly simpler (depending on the
actual vs), but more importantly I've seen a significant reduction in
compile times for some vertex shaders (albeit with an old llvm version
(3.3), and the reduction is only in the optimization passes run on the IR).
v2: adapt to the other draw change.

No changes with piglit.
---
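A rough plain-C sketch of the loop shape described above (hypothetical names,
a fixed float4 format, and no overflow handling for brevity; the real code
emits LLVM IR through gallivm, so this only illustrates the restructuring,
not the patch itself):

#include <stdint.h>

#define NUM_ATTRIBS 2   /* hypothetical */
#define VECTOR_LEN  4   /* one SoA vector's worth of vertices */

/* Per-attribute state corresponding to what the patch hoists out of the
 * inner loop. */
struct attrib_fetch {
   const uint8_t *map;     /* mapped vertex buffer */
   uint32_t vb_stride;     /* stride of the bound vertex buffer */
   uint32_t stride_fixed;  /* buffer_offset + src_offset, constant per attrib */
};

/* Outer loop per attribute, inner loop per vertex: the constants are
 * computed once per attribute, and the AoS->SoA transpose happens per
 * attribute instead of after all attributes have been fetched. */
static void
fetch_all(const struct attrib_fetch attr[NUM_ATTRIBS],
          const uint32_t index[VECTOR_LEN],
          float soa[NUM_ATTRIBS][4][VECTOR_LEN])
{
   for (unsigned a = 0; a < NUM_ATTRIBS; a++) {
      float aos[VECTOR_LEN][4];
      for (unsigned v = 0; v < VECTOR_LEN; v++) {
         /* pretend every attribute is PIPE_FORMAT_R32G32B32A32_FLOAT */
         const float *src = (const float *)
            (attr[a].map + attr[a].vb_stride * index[v] + attr[a].stride_fixed);
         for (unsigned c = 0; c < 4; c++)
            aos[v][c] = src[c];
      }
      /* AoS -> SoA transpose for this attribute */
      for (unsigned c = 0; c < 4; c++)
         for (unsigned v = 0; v < VECTOR_LEN; v++)
            soa[a][c][v] = aos[v][c];
   }
}

Everything that depends only on the attribute (format, stride, the fixed part
of the offset) is computed once per attribute rather than once per fetched
element, and the transpose no longer has to wait until all attributes have
been gathered.
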
 src/gallium/auxiliary/draw/draw_llvm.c | 190 +++--
 .../auxiliary/gallivm/lp_bld_arit_overflow.c   |  24 +++
 .../auxiliary/gallivm/lp_bld_arit_overflow.h   |   6 +
 3 files changed, 134 insertions(+), 86 deletions(-)
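
The lp_build_*_overflow helpers touched in lp_bld_arit_overflow.c chain a
sticky overflow bit through the offset computation. A loose C analogue of
that pattern, using compiler builtins purely for illustration (the gallivm
helpers build the equivalent LLVM IR instead), could look like:

#include <stdbool.h>
#include <stdint.h>

/* Each step ORs its carry into a sticky flag, so one branch at the end
 * covers the whole chained offset computation. */
static inline uint32_t
uadd_overflow(uint32_t a, uint32_t b, bool *ofbit)
{
   uint32_t sum;
   *ofbit |= __builtin_add_overflow(a, b, &sum);   /* GCC/Clang builtin */
   return sum;
}

static inline uint32_t
umul_overflow(uint32_t a, uint32_t b, bool *ofbit)
{
   uint32_t prod;
   *ofbit |= __builtin_mul_overflow(a, b, &prod);
   return prod;
}

/* usage sketch:
 *    bool ofbit = false;
 *    uint32_t off = umul_overflow(vb_stride, index, &ofbit);
 *    off = uadd_overflow(off, stride_fixed, &ofbit);
 *    if (ofbit || off > buffer_size_adj)
 *       ... fetch zeros instead ...
 */

The simplification mentioned in the log appears to be that, rather than
computing needed_buffer_size = offset + blocksize for every element and
comparing that against the buffer size, the blocksize is folded into a
precomputed buffer_size_adj up front, so the per-element test reduces to a
single compare of the accumulated offset (plus the sticky overflow bit).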

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 3b56856..2f82d9d 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -659,85 +659,42 @@ generate_vs(struct draw_llvm_variant *variant,
 static void
 generate_fetch(struct gallivm_state *gallivm,
struct draw_context *draw,
-   LLVMValueRef vbuffers_ptr,
+   const struct util_format_description *format_desc,
+   LLVMValueRef vb_stride,
+   LLVMValueRef stride_fixed,
+   LLVMValueRef map_ptr,
+   LLVMValueRef buffer_size_adj,
+   LLVMValueRef ofbit,
LLVMValueRef *res,
-   struct pipe_vertex_element *velem,
-   LLVMValueRef vbuf,
-   LLVMValueRef index,
-   LLVMValueRef instance_id,
-   LLVMValueRef start_instance)
+   LLVMValueRef index)
 {
-   const struct util_format_description *format_desc =
-  util_format_description(velem->src_format);
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices =
-  LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
-   velem->vertex_buffer_index, 0);
-   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
-   &indices, 1, "");
-   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
-   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
-   LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
-   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
LLVMValueRef stride;
LLVMValueRef buffer_overflowed;
-   LLVMValueRef needed_buffer_size;
LLVMValueRef temp_ptr =
   lp_build_alloca(gallivm,
   lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
-   LLVMValueRef ofbit = NULL;
struct lp_build_if_state if_ctx;
 
-   if (velem->src_format == PIPE_FORMAT_NONE) {
+   if (format_desc->format == PIPE_FORMAT_NONE) {
   *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
   return;
}
 
-   if (velem->instance_divisor) {
-  /* Index is equal to the start instance plus the number of current 
-   * instance divided by the divisor. In this case we compute it as:
-   * index = start_instance + (instance_id  / divisor)
-   */
-  LLVMValueRef current_instance;
-  current_instance = LLVMBuildUDiv(builder, instance_id,
-   lp_build_const_int32(gallivm, velem->instance_divisor),
-   "instance_divisor");
-  index = lp_build_uadd_overflow(gallivm, start_instance,
- current_instance, &ofbit);
-   }
-
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
-   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
-   stride = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
-   needed_buffer_size = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm,
-   util_format_get_blocksize(velem->src_format)),
-  &ofbit);
+   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
 
buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
- needed_buffer_size, buffer_size,
+ stride,