Re: [Mesa-dev] [PATCH 3/3] ac/llvm: drop pointless wrappers around umsb/imsb

2017-10-25 Thread Timothy Arceri

Series:

Reviewed-by: Timothy Arceri 

On 26/10/17 16:31, Dave Airlie wrote:

From: Dave Airlie 

Signed-off-by: Dave Airlie 
---
  src/amd/common/ac_nir_to_llvm.c | 16 ++--
  1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0167755..a736d34 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1227,18 +1227,6 @@ static LLVMValueRef emit_bcsel(struct ac_llvm_context 
*ctx,
return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
  }
  
-static LLVMValueRef emit_ifind_msb(struct ac_llvm_context *ctx,

-  LLVMValueRef src0)
-{
-   return ac_build_imsb(ctx, src0, ctx->i32);
-}
-
-static LLVMValueRef emit_ufind_msb(struct ac_llvm_context *ctx,
-  LLVMValueRef src0)
-{
-   return ac_build_umsb(ctx, src0, ctx->i32);
-}
-
  static LLVMValueRef emit_minmax_int(struct ac_llvm_context *ctx,
LLVMIntPredicate pred,
LLVMValueRef src0, LLVMValueRef src1)
@@ -1871,11 +1859,11 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_ufind_msb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   result = emit_ufind_msb(&ctx->ac, src[0]);
+   result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
break;
case nir_op_ifind_msb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   result = emit_ifind_msb(&ctx->ac, src[0]);
+   result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
break;
case nir_op_uadd_carry:
src[0] = ac_to_integer(&ctx->ac, src[0]);


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] ac/llvm: consolidate find lsb function.

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

This was the same between si and ac.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_llvm_build.c| 31 +++
 src/amd/common/ac_llvm_build.h|  4 +++
 src/amd/common/ac_nir_to_llvm.c   | 30 +-
 src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c | 27 ++--
 4 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 2b1f15b..242712e 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1775,3 +1775,34 @@ ac_lds_store(struct ac_llvm_context *ctx,
ac_build_indexed_store(ctx, ctx->lds,
   dw_addr, value);
 }
+
+LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
+LLVMTypeRef dst_type,
+LLVMValueRef src0)
+{
+   LLVMValueRef params[2] = {
+   src0,
+
+   /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
+* add special code to check for x=0. The reason is that
+* the LLVM behavior for x=0 is different from what we
+* need here. However, LLVM also assumes that ffs(x) is
+* in [0, 31], but GLSL expects that ffs(0) = -1, so
+* a conditional assignment to handle 0 is still required.
+*
+* The hardware already implements the correct behavior.
+*/
+   LLVMConstInt(ctx->i1, 1, false),
+   };
+
+   LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
+ params, 2,
+ AC_FUNC_ATTR_READNONE);
+
+   /* TODO: We need an intrinsic to skip this conditional. */
+   /* Check for zero: */
+   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
+  LLVMIntEQ, src0,
+  ctx->i32_0, ""),
+  LLVMConstInt(ctx->i32, -1, 0), lsb, "");
+}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index b47d51a..f790619 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -297,6 +297,10 @@ LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
 LLVMValueRef dw_addr);
 void ac_lds_store(struct ac_llvm_context *ctx,
  LLVMValueRef dw_addr, LLVMValueRef value);
+
+LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
+LLVMTypeRef dst_type,
+LLVMValueRef src0);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f78f486..0167755 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1227,34 +1227,6 @@ static LLVMValueRef emit_bcsel(struct ac_llvm_context 
*ctx,
return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
 }
 
-static LLVMValueRef emit_find_lsb(struct ac_llvm_context *ctx,
- LLVMValueRef src0)
-{
-   LLVMValueRef params[2] = {
-   src0,
-
-   /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
-* add special code to check for x=0. The reason is that
-* the LLVM behavior for x=0 is different from what we
-* need here.
-*
-* The hardware already implements the correct behavior.
-*/
-   LLVMConstInt(ctx->i1, 1, false),
-   };
-
-   LLVMValueRef lsb = ac_build_intrinsic(ctx, "llvm.cttz.i32", ctx->i32,
- params, 2,
- AC_FUNC_ATTR_READNONE);
-
-   /* TODO: We need an intrinsic to skip this conditional. */
-   /* Check for zero: */
-   return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
-  LLVMIntEQ, src0,
-  ctx->i32_0, ""),
-  LLVMConstInt(ctx->i32, -1, 0), lsb, "");
-}
-
 static LLVMValueRef emit_ifind_msb(struct ac_llvm_context *ctx,
   LLVMValueRef src0)
 {
@@ -1895,7 +1867,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_find_lsb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   result = emit_find_lsb(&ctx->ac, src[0]);
+   result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]);
break;
case nir_op_ufind_msb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
diff --git 

[Mesa-dev] [PATCH 3/3] ac/llvm: drop pointless wrappers around umsb/imsb

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_nir_to_llvm.c | 16 ++--
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0167755..a736d34 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1227,18 +1227,6 @@ static LLVMValueRef emit_bcsel(struct ac_llvm_context 
*ctx,
return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
 }
 
-static LLVMValueRef emit_ifind_msb(struct ac_llvm_context *ctx,
-  LLVMValueRef src0)
-{
-   return ac_build_imsb(ctx, src0, ctx->i32);
-}
-
-static LLVMValueRef emit_ufind_msb(struct ac_llvm_context *ctx,
-  LLVMValueRef src0)
-{
-   return ac_build_umsb(ctx, src0, ctx->i32);
-}
-
 static LLVMValueRef emit_minmax_int(struct ac_llvm_context *ctx,
LLVMIntPredicate pred,
LLVMValueRef src0, LLVMValueRef src1)
@@ -1871,11 +1859,11 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_ufind_msb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   result = emit_ufind_msb(&ctx->ac, src[0]);
+   result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
break;
case nir_op_ifind_msb:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   result = emit_ifind_msb(&ctx->ac, src[0]);
+   result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
break;
case nir_op_uadd_carry:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-- 
2.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] ac/llvm: drop v4f32empty. (v2)

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

This was unused.

v2: drop args.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_nir_to_llvm.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 9713c06..f78f486 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -150,8 +150,6 @@ struct nir_to_llvm_context {
LLVMTypeRef v4f32;
LLVMTypeRef voidt;
 
-   LLVMValueRef v4f32empty;
-
unsigned uniform_md_kind;
LLVMValueRef empty_md;
gl_shader_stage stage;
@@ -999,8 +997,6 @@ static void create_function(struct nir_to_llvm_context *ctx,
 
 static void setup_types(struct nir_to_llvm_context *ctx)
 {
-   LLVMValueRef args[4];
-
ctx->voidt = LLVMVoidTypeInContext(ctx->context);
ctx->i1 = LLVMIntTypeInContext(ctx->context, 1);
ctx->i8 = LLVMIntTypeInContext(ctx->context, 8);
@@ -1017,17 +1013,9 @@ static void setup_types(struct nir_to_llvm_context *ctx)
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
 
-   args[0] = ctx->ac.f32_0;
-   args[1] = ctx->ac.f32_0;
-   args[2] = ctx->ac.f32_0;
-   args[3] = ctx->ac.f32_1;
-   ctx->v4f32empty = LLVMConstVector(args, 4);
-
ctx->uniform_md_kind =
LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
-
-   args[0] = LLVMConstReal(ctx->f32, 2.5);
 }
 
 static int get_llvm_num_components(LLVMValueRef value)
-- 
2.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] ac/nir: move lds declaration/load/store into shared code.

2017-10-25 Thread Timothy Arceri

Reviewed-by: Timothy Arceri 

On 26/10/17 16:05, Dave Airlie wrote:

From: Dave Airlie 

This was duplicated between both drivers, share here.

Signed-off-by: Dave Airlie 
---
  src/amd/common/ac_llvm_build.c| 24 ++
  src/amd/common/ac_llvm_build.h| 12 +
  src/amd/common/ac_nir_to_llvm.c   | 56 ++-
  src/gallium/drivers/radeonsi/si_shader.c  | 20 ++--
  src/gallium/drivers/radeonsi/si_shader_internal.h |  1 -
  5 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 80b027e..946f97f 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1748,3 +1748,27 @@ void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
   "llvm.amdgcn.init.exec", ctx->voidt,
   &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
  }
+
+void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
+{
+   unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
+   ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
+LLVMPointerType(LLVMArrayType(ctx->i32, 
lds_size / 4), AC_LOCAL_ADDR_SPACE),
+"lds");
+}
+
+LLVMValueRef
+ac_lds_load(struct ac_llvm_context *ctx,
+   LLVMValueRef dw_addr)
+{
+   return ac_build_load(ctx, ctx->lds, dw_addr);
+}
+
+void
+ac_lds_store(struct ac_llvm_context *ctx,
+LLVMValueRef dw_addr, LLVMValueRef value)
+{
+   value = ac_to_integer(ctx, value);
+   ac_build_indexed_store(ctx, ctx->lds,
+  dw_addr, value);
+}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 996f558..7d57b8b 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -34,6 +34,10 @@
  extern "C" {
  #endif
  
+enum {

+   AC_LOCAL_ADDR_SPACE = 3,
+};
+
  struct ac_llvm_context {
LLVMContextRef context;
LLVMModuleRef module;
@@ -65,6 +69,8 @@ struct ac_llvm_context {
LLVMValueRef empty_md;
  
  	enum chip_class chip_class;

+
+   LLVMValueRef lds;
  };
  
  void

@@ -283,6 +289,12 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ac,
uint32_t num_outputs,
uint8_t *num_param_exports);
  void ac_init_exec_full_mask(struct ac_llvm_context *ctx);
+
+void ac_declare_lds_as_pointer(struct ac_llvm_context *ac);
+LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
+LLVMValueRef dw_addr);
+void ac_lds_store(struct ac_llvm_context *ctx,
+ LLVMValueRef dw_addr, LLVMValueRef value);
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 06937d6..cbd646e 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -162,7 +162,6 @@ struct nir_to_llvm_context {
LLVMValueRef empty_md;
gl_shader_stage stage;
  
-	LLVMValueRef lds;

LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
  
  	uint64_t input_mask;

@@ -548,14 +547,6 @@ static void set_userdata_location_indirect(struct 
ac_userdata_info *ud_info, uin
ud_info->indirect_offset = indirect_offset;
  }
  
-static void declare_tess_lds(struct nir_to_llvm_context *ctx)

-{
-   unsigned lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
-   ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32zero,
-LLVMPointerType(LLVMArrayType(ctx->i32, 
lds_size / 4), LOCAL_ADDR_SPACE),
-   "tess_lds");
-}
-
  struct user_sgpr_info {
bool need_ring_offsets;
uint8_t sgpr_count;
@@ -971,7 +962,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
set_userdata_location_shader(ctx, 
AC_UD_VS_LS_TCS_IN_LAYOUT, &user_sgpr_idx, 1);
}
if (ctx->options->key.vs.as_ls)
-   declare_tess_lds(ctx);
+   ac_declare_lds_as_pointer(&ctx->ac);
break;
case MESA_SHADER_TESS_CTRL:
radv_define_vs_user_sgprs_phase2(ctx, stage, has_previous_stage, 
previous_stage, &user_sgpr_idx);
@@ -980,7 +971,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, 
&user_sgpr_idx, 4);
if (ctx->view_index)
set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, 
&user_sgpr_idx, 1);
-   declare_tess_lds(ctx);
+   ac_declare_lds_as_pointer(&ctx->ac);
break;
case MESA_SHADER_TESS_EVAL:
set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, 
&user_sgpr_idx, 1);
@@ -998,7 +989,7 @@ static void create_function(struct nir_to_llvm_context 

Re: [Mesa-dev] [PATCH] ac/llvm: drop v4f32empty.

2017-10-25 Thread Timothy Arceri

Reviewed-by: Timothy Arceri 

On 26/10/17 16:22, Dave Airlie wrote:

From: Dave Airlie 

This was unused.

Signed-off-by: Dave Airlie 
---
  src/amd/common/ac_nir_to_llvm.c | 8 
  1 file changed, 8 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 9713c06..860f39c 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -150,8 +150,6 @@ struct nir_to_llvm_context {
LLVMTypeRef v4f32;
LLVMTypeRef voidt;
  
-	LLVMValueRef v4f32empty;

-
unsigned uniform_md_kind;
LLVMValueRef empty_md;
gl_shader_stage stage;
@@ -1017,12 +1015,6 @@ static void setup_types(struct nir_to_llvm_context *ctx)
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  
-	args[0] = ctx->ac.f32_0;

-   args[1] = ctx->ac.f32_0;
-   args[2] = ctx->ac.f32_0;
-   args[3] = ctx->ac.f32_1;
-   ctx->v4f32empty = LLVMConstVector(args, 4);
-
ctx->uniform_md_kind =
LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] ac/llvm: add i1false/i1true to common code.

2017-10-25 Thread Timothy Arceri

Reviewed-by: Timothy Arceri 

On 26/10/17 16:20, Dave Airlie wrote:

From: Dave Airlie 

These get used in a fair few places.

Signed-off-by: Dave Airlie 
---
  src/amd/common/ac_llvm_build.c  | 15 ++-
  src/amd/common/ac_llvm_build.h  |  2 ++
  src/amd/common/ac_nir_to_llvm.c | 57 -
  3 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 946f97f..2b1f15b 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -75,6 +75,9 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, 
LLVMContextRef context,
ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
  
+	ctx->i1false = LLVMConstInt(ctx->i1, 0, false);

+   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
+
ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 "range", 5);
  
@@ -946,8 +949,8 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,

LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex,
voffset,
-   LLVMConstInt(ctx->i1, 0, 0), /* glc */
-   LLVMConstInt(ctx->i1, 0, 0), /* slc */
+   ctx->i1false, /* glc */
+   ctx->i1false, /* slc */
};
  
  	return ac_build_intrinsic(ctx,

@@ -1150,7 +1153,7 @@ ac_build_umsb(struct ac_llvm_context *ctx,
  {
LLVMValueRef args[2] = {
arg,
-   LLVMConstInt(ctx->i1, 1, 0),
+   ctx->i1true,
};
LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
  dst_type, args, ARRAY_SIZE(args),
@@ -1276,9 +1279,9 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context 
*ctx,
args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
if (sample)
args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* glc */
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* lwe */
+   args[num_args++] = ctx->i1false; /* glc */
+   args[num_args++] = ctx->i1false; /* slc */
+   args[num_args++] = ctx->i1false; /* lwe */
args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
  
  		switch (a->opcode) {

diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 7d57b8b..b47d51a 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -60,6 +60,8 @@ struct ac_llvm_context {
LLVMValueRef i32_1;
LLVMValueRef f32_0;
LLVMValueRef f32_1;
+   LLVMValueRef i1true;
+   LLVMValueRef i1false;
  
  	unsigned range_md_kind;

unsigned invariant_load_md_kind;
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 00a3ec2..9713c06 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -150,8 +150,6 @@ struct nir_to_llvm_context {
LLVMTypeRef v4f32;
LLVMTypeRef voidt;
  
-	LLVMValueRef i1true;

-   LLVMValueRef i1false;
LLVMValueRef v4f32empty;
  
  	unsigned uniform_md_kind;

@@ -1019,9 +1017,6 @@ static void setup_types(struct nir_to_llvm_context *ctx)
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
  
-	ctx->i1false = LLVMConstInt(ctx->i1, 0, false);

-   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
-
args[0] = ctx->ac.f32_0;
args[1] = ctx->ac.f32_0;
args[2] = ctx->ac.f32_0;
@@ -2346,13 +2341,12 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
unsigned writemask = nir_intrinsic_write_mask(instr);
LLVMValueRef base_data, base_offset;
LLVMValueRef params[6];
-   LLVMValueRef i1false = LLVMConstInt(ctx->ac.i1, 0, false);
  
  	params[1] = ctx->abi->load_ssbo(ctx->abi,

get_src(ctx, instr->src[1]), true);
params[2] = LLVMConstInt(ctx->ac.i32, 0, false); /* vindex */
-   params[4] = i1false;  /* glc */
-   params[5] = i1false;  /* slc */
+   params[4] = ctx->ac.i1false;  /* glc */
+   params[5] = ctx->ac.i1false;  /* slc */
  
  	if (components_32bit > 1)

data_type = LLVMVectorType(ctx->ac.f32, components_32bit);
@@ -2508,15 +2502,14 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
else
unreachable("unhandled number of components");
  
-		LLVMValueRef i1false = LLVMConstInt(ctx->ac.i1, 0, false);

LLVMValueRef params[] = {
ctx->abi->load_ssbo(ctx->abi,
 

[Mesa-dev] [PATCH] ac/llvm: drop v4f32empty.

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

This was unused.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_nir_to_llvm.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 9713c06..860f39c 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -150,8 +150,6 @@ struct nir_to_llvm_context {
LLVMTypeRef v4f32;
LLVMTypeRef voidt;
 
-   LLVMValueRef v4f32empty;
-
unsigned uniform_md_kind;
LLVMValueRef empty_md;
gl_shader_stage stage;
@@ -1017,12 +1015,6 @@ static void setup_types(struct nir_to_llvm_context *ctx)
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
 
-   args[0] = ctx->ac.f32_0;
-   args[1] = ctx->ac.f32_0;
-   args[2] = ctx->ac.f32_0;
-   args[3] = ctx->ac.f32_1;
-   ctx->v4f32empty = LLVMConstVector(args, 4);
-
ctx->uniform_md_kind =
LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
-- 
2.9.5

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] ac/llvm: add i1false/i1true to common code.

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

These get used in a fair few places.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_llvm_build.c  | 15 ++-
 src/amd/common/ac_llvm_build.h  |  2 ++
 src/amd/common/ac_nir_to_llvm.c | 57 -
 3 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 946f97f..2b1f15b 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -75,6 +75,9 @@ ac_llvm_context_init(struct ac_llvm_context *ctx, 
LLVMContextRef context,
ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
 
+   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
+   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
+
ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
 "range", 5);
 
@@ -946,8 +949,8 @@ LLVMValueRef ac_build_buffer_load_format(struct 
ac_llvm_context *ctx,
LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
vindex,
voffset,
-   LLVMConstInt(ctx->i1, 0, 0), /* glc */
-   LLVMConstInt(ctx->i1, 0, 0), /* slc */
+   ctx->i1false, /* glc */
+   ctx->i1false, /* slc */
};
 
return ac_build_intrinsic(ctx,
@@ -1150,7 +1153,7 @@ ac_build_umsb(struct ac_llvm_context *ctx,
 {
LLVMValueRef args[2] = {
arg,
-   LLVMConstInt(ctx->i1, 1, 0),
+   ctx->i1true,
};
LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.ctlz.i32",
  dst_type, args, ARRAY_SIZE(args),
@@ -1276,9 +1279,9 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context 
*ctx,
args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
if (sample)
args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* glc */
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* slc */
-   args[num_args++] = LLVMConstInt(ctx->i1, 0, 0); /* lwe */
+   args[num_args++] = ctx->i1false; /* glc */
+   args[num_args++] = ctx->i1false; /* slc */
+   args[num_args++] = ctx->i1false; /* lwe */
args[num_args++] = LLVMConstInt(ctx->i1, a->da, 0);
 
switch (a->opcode) {
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 7d57b8b..b47d51a 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -60,6 +60,8 @@ struct ac_llvm_context {
LLVMValueRef i32_1;
LLVMValueRef f32_0;
LLVMValueRef f32_1;
+   LLVMValueRef i1true;
+   LLVMValueRef i1false;
 
unsigned range_md_kind;
unsigned invariant_load_md_kind;
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 00a3ec2..9713c06 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -150,8 +150,6 @@ struct nir_to_llvm_context {
LLVMTypeRef v4f32;
LLVMTypeRef voidt;
 
-   LLVMValueRef i1true;
-   LLVMValueRef i1false;
LLVMValueRef v4f32empty;
 
unsigned uniform_md_kind;
@@ -1019,9 +1017,6 @@ static void setup_types(struct nir_to_llvm_context *ctx)
ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
 
-   ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
-   ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
-
args[0] = ctx->ac.f32_0;
args[1] = ctx->ac.f32_0;
args[2] = ctx->ac.f32_0;
@@ -2346,13 +2341,12 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
unsigned writemask = nir_intrinsic_write_mask(instr);
LLVMValueRef base_data, base_offset;
LLVMValueRef params[6];
-   LLVMValueRef i1false = LLVMConstInt(ctx->ac.i1, 0, false);
 
params[1] = ctx->abi->load_ssbo(ctx->abi,
get_src(ctx, instr->src[1]), true);
params[2] = LLVMConstInt(ctx->ac.i32, 0, false); /* vindex */
-   params[4] = i1false;  /* glc */
-   params[5] = i1false;  /* slc */
+   params[4] = ctx->ac.i1false;  /* glc */
+   params[5] = ctx->ac.i1false;  /* slc */
 
if (components_32bit > 1)
data_type = LLVMVectorType(ctx->ac.f32, components_32bit);
@@ -2508,15 +2502,14 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
else
unreachable("unhandled number of components");
 
-   LLVMValueRef i1false = LLVMConstInt(ctx->ac.i1, 0, false);
LLVMValueRef params[] = {
ctx->abi->load_ssbo(ctx->abi,
get_src(ctx, 

Re: [Mesa-dev] [PATCH] ac/llvm: use the ac i32 0/1 and f32 0/1 llvm types.

2017-10-25 Thread Timothy Arceri

Reviewed-by: Timothy Arceri 

On 26/10/17 16:14, Dave Airlie wrote:

From: Dave Airlie 

This just avoids having two copies of these.

Signed-off-by: Dave Airlie 
---
  src/amd/common/ac_nir_to_llvm.c | 112 +++-
  1 file changed, 52 insertions(+), 60 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index cbd646e..00a3ec2 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -152,10 +152,6 @@ struct nir_to_llvm_context {
  
  	LLVMValueRef i1true;

LLVMValueRef i1false;
-   LLVMValueRef i32zero;
-   LLVMValueRef i32one;
-   LLVMValueRef f32zero;
-   LLVMValueRef f32one;
LLVMValueRef v4f32empty;
  
  	unsigned uniform_md_kind;

@@ -1025,15 +1021,11 @@ static void setup_types(struct nir_to_llvm_context *ctx)
  
  	ctx->i1false = LLVMConstInt(ctx->i1, 0, false);

ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
-   ctx->i32zero = LLVMConstInt(ctx->i32, 0, false);
-   ctx->i32one = LLVMConstInt(ctx->i32, 1, false);
-   ctx->f32zero = LLVMConstReal(ctx->f32, 0.0);
-   ctx->f32one = LLVMConstReal(ctx->f32, 1.0);
-
-   args[0] = ctx->f32zero;
-   args[1] = ctx->f32zero;
-   args[2] = ctx->f32zero;
-   args[3] = ctx->f32one;
+
+   args[0] = ctx->ac.f32_0;
+   args[1] = ctx->ac.f32_0;
+   args[2] = ctx->ac.f32_0;
+   args[3] = ctx->ac.f32_1;
ctx->v4f32empty = LLVMConstVector(args, 4);
  
  	ctx->uniform_md_kind =

@@ -1416,7 +1408,7 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context 
*ctx,
result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, "");
  
  	if (ctx->options->chip_class >= VI)

-   result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, 
"");
+   result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, 
"");
else {
/* for SI/CIK */
/* 0x38800000 is smallest half float value (2^-14) in 32-bit 
float,
@@ -1429,9 +1421,9 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context 
*ctx,
 LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->i32, 
0x38800000, false), ctx->f32, ""),
 temp, "");
cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
- temp, ctx->f32zero, "");
+ temp, ctx->ac.f32_0, "");
cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
-   result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, result, 
"");
+   result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, 
"");
}
return result;
  }
@@ -2838,7 +2830,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
for (unsigned i = 0; i < instr->num_components + comp; i++) {
value[i] = ac_lds_load(&ctx->ac, dw_addr);
dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
result = build_varying_gather_values(&ctx->ac, value, 
instr->num_components, comp);
result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, 
&instr->dest.ssa), "");
@@ -2877,7 +2869,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
for (unsigned i = comp; i < instr->num_components + comp; i++) {
value[i] = ac_lds_load(&ctx->ac, dw_addr);
dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
result = build_varying_gather_values(&ctx->ac, value, 
instr->num_components, comp);
result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, 
&instr->dest.ssa), "");
@@ -2945,7 +2937,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
4 * (base + chan), 1, 0, 
true, false);
  
  		dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,

-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
  
  	if (writemask == 0xF) {

@@ -3023,12 +3015,12 @@ load_gs_input(struct nir_to_llvm_context *ctx,
args[0] = ctx->esgs_ring;
args[1] = vtx_offset;
args[2] = LLVMConstInt(ctx->i32, (param * 4 + i + 
const_index) * 256, false);
-   args[3] = ctx->i32zero;
-   args[4] = ctx->i32one; /* OFFEN */
-   args[5] = ctx->i32zero; /* IDXEN */
-   args[6] = ctx->i32one; /* GLC */
-   args[7] = ctx->i32zero; /* SLC */
-   args[8] = ctx->i32zero; /* TFE */
+   

[Mesa-dev] [PATCH] ac/llvm: use the ac i32 0/1 and f32 0/1 llvm types.

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

This just avoids having two copies of these.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_nir_to_llvm.c | 112 +++-
 1 file changed, 52 insertions(+), 60 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index cbd646e..00a3ec2 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -152,10 +152,6 @@ struct nir_to_llvm_context {
 
LLVMValueRef i1true;
LLVMValueRef i1false;
-   LLVMValueRef i32zero;
-   LLVMValueRef i32one;
-   LLVMValueRef f32zero;
-   LLVMValueRef f32one;
LLVMValueRef v4f32empty;
 
unsigned uniform_md_kind;
@@ -1025,15 +1021,11 @@ static void setup_types(struct nir_to_llvm_context *ctx)
 
ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
-   ctx->i32zero = LLVMConstInt(ctx->i32, 0, false);
-   ctx->i32one = LLVMConstInt(ctx->i32, 1, false);
-   ctx->f32zero = LLVMConstReal(ctx->f32, 0.0);
-   ctx->f32one = LLVMConstReal(ctx->f32, 1.0);
-
-   args[0] = ctx->f32zero;
-   args[1] = ctx->f32zero;
-   args[2] = ctx->f32zero;
-   args[3] = ctx->f32one;
+
+   args[0] = ctx->ac.f32_0;
+   args[1] = ctx->ac.f32_0;
+   args[2] = ctx->ac.f32_0;
+   args[3] = ctx->ac.f32_1;
ctx->v4f32empty = LLVMConstVector(args, 4);
 
ctx->uniform_md_kind =
@@ -1416,7 +1408,7 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context 
*ctx,
result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, "");
 
if (ctx->options->chip_class >= VI)
-   result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, 
result, "");
+   result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, 
result, "");
else {
/* for SI/CIK */
/* 0x38800000 is smallest half float value (2^-14) in 32-bit 
float,
@@ -1429,9 +1421,9 @@ static LLVMValueRef emit_f2f16(struct nir_to_llvm_context 
*ctx,
 LLVMBuildBitCast(ctx->builder, 
LLVMConstInt(ctx->i32, 0x38800000, false), ctx->f32, ""),
 temp, "");
cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
- temp, ctx->f32zero, "");
+ temp, ctx->ac.f32_0, "");
cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
-   result = LLVMBuildSelect(ctx->builder, cond, ctx->f32zero, 
result, "");
+   result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, 
result, "");
}
return result;
 }
@@ -2838,7 +2830,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
for (unsigned i = 0; i < instr->num_components + comp; i++) {
value[i] = ac_lds_load(&ctx->ac, dw_addr);
dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
result = build_varying_gather_values(&ctx->ac, value, 
instr->num_components, comp);
result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, 
&instr->dest.ssa), "");
@@ -2877,7 +2869,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
for (unsigned i = comp; i < instr->num_components + comp; i++) {
value[i] = ac_lds_load(&ctx->ac, dw_addr);
dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
result = build_varying_gather_values(&ctx->ac, value, 
instr->num_components, comp);
result = LLVMBuildBitCast(ctx->builder, result, get_def_type(ctx->nir, 
&instr->dest.ssa), "");
@@ -2945,7 +2937,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
4 * (base + chan), 1, 0, 
true, false);
 
dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
-  ctx->i32one, "");
+  ctx->ac.i32_1, "");
}
 
if (writemask == 0xF) {
@@ -3023,12 +3015,12 @@ load_gs_input(struct nir_to_llvm_context *ctx,
args[0] = ctx->esgs_ring;
args[1] = vtx_offset;
args[2] = LLVMConstInt(ctx->i32, (param * 4 + i + 
const_index) * 256, false);
-   args[3] = ctx->i32zero;
-   args[4] = ctx->i32one; /* OFFEN */
-   args[5] = ctx->i32zero; /* IDXEN */
-   args[6] = ctx->i32one; /* GLC */
-   args[7] = ctx->i32zero; /* SLC */
-   args[8] = ctx->i32zero; /* TFE */
+   args[3] = ctx->ac.i32_0;
+   args[4] = ctx->ac.i32_1; 

Re: [Mesa-dev] [PATCH 03/25] threads: update for late C11 changes

2017-10-25 Thread Jose Fonseca

On 26/10/17 13:55, Jose Fonseca wrote:

On 23/10/17 20:15, Emil Velikov wrote:

Hi Nicolai,

On 22 October 2017 at 20:07, Nicolai Hähnle  wrote:

From: Nicolai Hähnle 

C11 threads were changed to use struct timespec instead of xtime, and
thrd_sleep got a second argument.

As xtime was replaced with timespec there's a couple of odd bits in 
the code.




See http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1554.htm and
http://en.cppreference.com/w/c/thread/{thrd_sleep,cnd_timedwait,mtx_timedlock} 



Note that cnd_timedwait is spec'd to be relative to TIME_UTC / 
CLOCK_REALTIME.


Cc: Jose Fonseca 
---
  include/c11/threads.h   | 11 ---
  include/c11/threads_posix.h | 39 
+++
  include/c11/threads_win32.h | 37 
+++--

  src/egl/drivers/dri2/egl_dri2.c | 24 +---
  4 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/include/c11/threads.h b/include/c11/threads.h
index 573348d8091..3c3f23a8ab8 100644
--- a/include/c11/threads.h
+++ b/include/c11/threads.h
@@ -23,42 +23,31 @@
   * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN 
NO EVENT
   * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE 
BE LIABLE
   * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR 
OTHERWISE,
   * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
USE OR OTHER

   * DEALINGS IN THE SOFTWARE.
   */
  #ifndef EMULATED_THREADS_H_INCLUDED_
  #define EMULATED_THREADS_H_INCLUDED_

  #include 
-#ifdef _MSC_VER
-#include   // for xtime
-#endif

  #ifndef TIME_UTC
  #define TIME_UTC 1
  #endif

  #include "c99_compat.h" /* for `inline` */

  /* types */
  typedef void (*tss_dtor_t)(void*);
  typedef int (*thrd_start_t)(void*);

-#ifndef _MSC_VER
-struct xtime {
-    time_t sec;
-    long nsec;
-};
-typedef struct xtime xtime;
-#endif
-


We don't have a fall-back declaration of the struct, yet we use it
below and provide a timespec_get() implementation.
I'd imagine you haven't tested this on Windows (hence Jose in CC)?

Quick search suggests that MSVC 2015 was the first one that introduces
the struct and timespec_get.

If we're safe as-is, please add a comment with some details - I'd
imagine Jose had better knowledge in the area.



  /* 7.25.7 Time functions */
  // 7.25.6.1
+#if 0

I'd just drop the hunk mentioning that timespec_get() is part of time.h


Thanks
Emil


If there's doubt, I suggest testing Visual Studio with AppVeyor by 
pushing the changes as a feature branch to FDO's git -- I believe that 
should trigger an AppVeyor build.  (Push to a github repos hooked into 
Appveyor, depending on what people are more comfortable with.)


Alternatively reach out to Brian or Roland.

I'm currently on PTO, so I'm afraid I don't have the time nor a devel 
setup to try this out..


Jose
I forgot to say: assuming we don't need xtime anymore, at a glance, the 
patch looks ok to me.



Jose
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] ac/nir: move lds declaration/load/store into shared code.

2017-10-25 Thread Dave Airlie
From: Dave Airlie 

This was duplicated between both drivers, share here.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_llvm_build.c| 24 ++
 src/amd/common/ac_llvm_build.h| 12 +
 src/amd/common/ac_nir_to_llvm.c   | 56 ++-
 src/gallium/drivers/radeonsi/si_shader.c  | 20 ++--
 src/gallium/drivers/radeonsi/si_shader_internal.h |  1 -
 5 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 80b027e..946f97f 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1748,3 +1748,27 @@ void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
   "llvm.amdgcn.init.exec", ctx->voidt,
   _mask, 1, AC_FUNC_ATTR_CONVERGENT);
 }
+
+void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
+{
+   unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
+   ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
+LLVMPointerType(LLVMArrayType(ctx->i32, 
lds_size / 4), AC_LOCAL_ADDR_SPACE),
+"lds");
+}
+
+LLVMValueRef
+ac_lds_load(struct ac_llvm_context *ctx,
+   LLVMValueRef dw_addr)
+{
+   return ac_build_load(ctx, ctx->lds, dw_addr);
+}
+
+void
+ac_lds_store(struct ac_llvm_context *ctx,
+LLVMValueRef dw_addr, LLVMValueRef value)
+{
+   value = ac_to_integer(ctx, value);
+   ac_build_indexed_store(ctx, ctx->lds,
+  dw_addr, value);
+}
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 996f558..7d57b8b 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -34,6 +34,10 @@
 extern "C" {
 #endif
 
+enum {
+   AC_LOCAL_ADDR_SPACE = 3,
+};
+
 struct ac_llvm_context {
LLVMContextRef context;
LLVMModuleRef module;
@@ -65,6 +69,8 @@ struct ac_llvm_context {
LLVMValueRef empty_md;
 
enum chip_class chip_class;
+
+   LLVMValueRef lds;
 };
 
 void
@@ -283,6 +289,12 @@ void ac_optimize_vs_outputs(struct ac_llvm_context *ac,
uint32_t num_outputs,
uint8_t *num_param_exports);
 void ac_init_exec_full_mask(struct ac_llvm_context *ctx);
+
+void ac_declare_lds_as_pointer(struct ac_llvm_context *ac);
+LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
+LLVMValueRef dw_addr);
+void ac_lds_store(struct ac_llvm_context *ctx,
+ LLVMValueRef dw_addr, LLVMValueRef value);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 06937d6..cbd646e 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -162,7 +162,6 @@ struct nir_to_llvm_context {
LLVMValueRef empty_md;
gl_shader_stage stage;
 
-   LLVMValueRef lds;
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
 
uint64_t input_mask;
@@ -548,14 +547,6 @@ static void set_userdata_location_indirect(struct 
ac_userdata_info *ud_info, uin
ud_info->indirect_offset = indirect_offset;
 }
 
-static void declare_tess_lds(struct nir_to_llvm_context *ctx)
-{
-   unsigned lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
-   ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32zero,
-LLVMPointerType(LLVMArrayType(ctx->i32, 
lds_size / 4), LOCAL_ADDR_SPACE),
-   "tess_lds");
-}
-
 struct user_sgpr_info {
bool need_ring_offsets;
uint8_t sgpr_count;
@@ -971,7 +962,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
set_userdata_location_shader(ctx, 
AC_UD_VS_LS_TCS_IN_LAYOUT, _sgpr_idx, 1);
}
if (ctx->options->key.vs.as_ls)
-   declare_tess_lds(ctx);
+   ac_declare_lds_as_pointer(>ac);
break;
case MESA_SHADER_TESS_CTRL:
radv_define_vs_user_sgprs_phase2(ctx, stage, 
has_previous_stage, previous_stage, _sgpr_idx);
@@ -980,7 +971,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
set_userdata_location_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, 
_sgpr_idx, 4);
if (ctx->view_index)
set_userdata_location_shader(ctx, AC_UD_VIEW_INDEX, 
_sgpr_idx, 1);
-   declare_tess_lds(ctx);
+   ac_declare_lds_as_pointer(>ac);
break;
case MESA_SHADER_TESS_EVAL:
set_userdata_location_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, 
_sgpr_idx, 1);
@@ -998,7 +989,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
if (ctx->view_index)
set_userdata_location_shader(ctx, 

Re: [Mesa-dev] [PATCH 03/25] threads: update for late C11 changes

2017-10-25 Thread Jose Fonseca

On 23/10/17 20:15, Emil Velikov wrote:

Hi Nicolai,

On 22 October 2017 at 20:07, Nicolai Hähnle  wrote:

From: Nicolai Hähnle 

C11 threads were changed to use struct timespec instead of xtime, and
thrd_sleep got a second argument.


As xtime was replaced with timespec there's a couple of odd bits in the code.



See http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1554.htm and
http://en.cppreference.com/w/c/thread/{thrd_sleep,cnd_timedwait,mtx_timedlock}

Note that cnd_timedwait is spec'd to be relative to TIME_UTC / CLOCK_REALTIME.

Cc: Jose Fonseca 
---
  include/c11/threads.h   | 11 ---
  include/c11/threads_posix.h | 39 +++
  include/c11/threads_win32.h | 37 +++--
  src/egl/drivers/dri2/egl_dri2.c | 24 +---
  4 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/include/c11/threads.h b/include/c11/threads.h
index 573348d8091..3c3f23a8ab8 100644
--- a/include/c11/threads.h
+++ b/include/c11/threads.h
@@ -23,42 +23,31 @@
   * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
   * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
   * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
   * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   * DEALINGS IN THE SOFTWARE.
   */
  #ifndef EMULATED_THREADS_H_INCLUDED_
  #define EMULATED_THREADS_H_INCLUDED_

  #include 
-#ifdef _MSC_VER
-#include   // for xtime
-#endif

  #ifndef TIME_UTC
  #define TIME_UTC 1
  #endif

  #include "c99_compat.h" /* for `inline` */

  /* types */
  typedef void (*tss_dtor_t)(void*);
  typedef int (*thrd_start_t)(void*);

-#ifndef _MSC_VER
-struct xtime {
-time_t sec;
-long nsec;
-};
-typedef struct xtime xtime;
-#endif
-


We don't have a fall-back declaration of the struct, yet we use it
below and provide a timespec_get() implementation.
I'd imagine you haven't tested this on Windows (hence Jose in CC)?

Quick search suggests that MSVC 2015 was the first one that introduces
the struct and timespec_get.

If we're safe as-is, please add a comment with some details - I'd
imagine Jose had better knowledge in the area.



  /* 7.25.7 Time functions */
  // 7.25.6.1
+#if 0

I'd just drop the hunk mentioning that timespec_get() is part of time.h


Thanks
Emil


If there's doubt, I suggest testing Visual Studio with AppVeyor by 
pushing the changes as a feature branch to FDO's git -- I believe that 
should trigger an AppVeyor build.  (Push to a github repos hooked into 
Appveyor, depending on what people are more comfortable with.)


Alternatively reach out to Brian or Roland.

I'm currently on PTO, so I'm afraid I don't have the time nor a devel 
setup to try this out..


Jose


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] broadcom/genxml: Fix decoding of groups with small fields.

2017-10-25 Thread Kenneth Graunke
Groups containing fields smaller than a byte are probably not being decoded
correctly.  For example:


  


v3d_field_iterator_next would properly walk over each element of the
array, incrementing group_iter.  However, the code to print the actual
values only considered iter->field->start/end, which are 0 and 3 in the
above example.  So it would always fetch bits 3:0 of the current byte,
printing the same value over and over.

Cc: Eric Anholt 
---
 src/broadcom/cle/v3d_decoder.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

Hey Eric,

This patch is totally untested...I assume I need to build vc5, and
I can't do that because I don't have a simulator.  Not even compile
tested.  I figured you had the same bug though, so I'd try and fix it.

--Ken

diff --git a/src/broadcom/cle/v3d_decoder.c b/src/broadcom/cle/v3d_decoder.c
index 4ac40af05e8..9c457b76068 100644
--- a/src/broadcom/cle/v3d_decoder.c
+++ b/src/broadcom/cle/v3d_decoder.c
@@ -781,8 +781,10 @@ v3d_field_iterator_next(struct v3d_field_iterator *iter)
 
 const char *enum_name = NULL;
 
-int s = iter->field->start;
-int e = iter->field->end;
+int group_member_offset =
+iter_group_offset_bits(iter, iter->group_iter);
+int s = group_member_offset + iter->field->start;
+int e = group_member_offset + iter->field->end;
 
 switch (iter->field->type.kind) {
 case V3D_TYPE_UNKNOWN:
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] intel/genxml: Fix decoding of groups with fields smaller than a DWord.

2017-10-25 Thread Kenneth Graunke
Groups containing fields smaller than a DWord were not being decoded
correctly.  For example:


  


gen_field_iterator_next would properly walk over each element of the
array, incrementing group_iter, and calling iter_group_offset_bits()
to advance to the proper DWord.  However, the code to print the actual
values only considered iter->field->start/end, which are 0 and 3 in the
above example.  So it would always fetch bits 3:0 of the current DWord
when printing values, instead of advancing to each element of the array,
printing bits 0-3, 4-7, 8-11, and so on.

To fix this, we add new iter->start/end tracking, which properly
advances for each instance of a group's field.

Caught by Matt Turner while working on 3DSTATE_VF_COMPONENT_PACKING,
with a patch to convert it to use an array of bitfields (the example
above).

This also fixes the decoding of 3DSTATE_SBE's "Attribute Active
Component Format" fields.
---
 src/intel/common/gen_decoder.c | 24 ++--
 src/intel/common/gen_decoder.h |  2 ++
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/intel/common/gen_decoder.c b/src/intel/common/gen_decoder.c
index 85880143f00..1bf69ac4f94 100644
--- a/src/intel/common/gen_decoder.c
+++ b/src/intel/common/gen_decoder.c
@@ -843,8 +843,12 @@ iter_advance_field(struct gen_field_iterator *iter)
strncpy(iter->name, iter->field->name, sizeof(iter->name));
else
   memset(iter->name, 0, sizeof(iter->name));
-   iter->dword = iter_group_offset_bits(iter, iter->group_iter) / 32 +
-  iter->field->start / 32;
+
+   int group_member_offset = iter_group_offset_bits(iter, iter->group_iter);
+
+   iter->start = group_member_offset + iter->field->start;
+   iter->end = group_member_offset + iter->field->end;
+   iter->dword = iter->start / 32;
iter->struct_desc = NULL;
 
return true;
@@ -861,7 +865,7 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
if (!iter_advance_field(iter))
   return false;
 
-   if ((iter->field->end - iter->field->start) > 32)
+   if ((iter->end - iter->start) > 32)
   v.qw = ((uint64_t) iter->p[iter->dword+1] << 32) | iter->p[iter->dword];
else
   v.qw = iter->p[iter->dword];
@@ -871,13 +875,13 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
switch (iter->field->type.kind) {
case GEN_TYPE_UNKNOWN:
case GEN_TYPE_INT: {
-  uint64_t value = field(v.qw, iter->field->start, iter->field->end);
+  uint64_t value = field(v.qw, iter->start, iter->end);
   snprintf(iter->value, sizeof(iter->value), "%"PRId64, value);
   enum_name = gen_get_enum_name(>field->inline_enum, value);
   break;
}
case GEN_TYPE_UINT: {
-  uint64_t value = field(v.qw, iter->field->start, iter->field->end);
+  uint64_t value = field(v.qw, iter->start, iter->end);
   snprintf(iter->value, sizeof(iter->value), "%"PRIu64, value);
   enum_name = gen_get_enum_name(>field->inline_enum, value);
   break;
@@ -886,7 +890,7 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
   const char *true_string =
  iter->print_colors ? "\e[0;35mtrue\e[0m" : "true";
   snprintf(iter->value, sizeof(iter->value), "%s",
-   field(v.qw, iter->field->start, iter->field->end) ?
+   field(v.qw, iter->start, iter->end) ?
true_string : "false");
   break;
}
@@ -896,7 +900,7 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
case GEN_TYPE_ADDRESS:
case GEN_TYPE_OFFSET:
   snprintf(iter->value, sizeof(iter->value), "0x%08"PRIx64,
-   field_address(v.qw, iter->field->start, iter->field->end));
+   field_address(v.qw, iter->start, iter->end));
   break;
case GEN_TYPE_STRUCT:
   snprintf(iter->value, sizeof(iter->value), "",
@@ -907,8 +911,8 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
   break;
case GEN_TYPE_UFIXED:
   snprintf(iter->value, sizeof(iter->value), "%f",
-   (float) field(v.qw, iter->field->start,
- iter->field->end) / (1 << iter->field->type.f));
+   (float) field(v.qw, iter->start,
+ iter->end) / (1 << iter->field->type.f));
   break;
case GEN_TYPE_SFIXED:
   /* FIXME: Sign extend extracted field. */
@@ -917,7 +921,7 @@ gen_field_iterator_next(struct gen_field_iterator *iter)
case GEN_TYPE_MBO:
break;
case GEN_TYPE_ENUM: {
-  uint64_t value = field(v.qw, iter->field->start, iter->field->end);
+  uint64_t value = field(v.qw, iter->start, iter->end);
   snprintf(iter->value, sizeof(iter->value),
"%"PRId64, value);
   enum_name = gen_get_enum_name(iter->field->type.gen_enum, value);
diff --git a/src/intel/common/gen_decoder.h b/src/intel/common/gen_decoder.h
index cfc9f2e3f15..12d4551a127 100644
--- a/src/intel/common/gen_decoder.h
+++ b/src/intel/common/gen_decoder.h
@@ -58,6 +58,8 @@ struct 

Re: [Mesa-dev] [PATCH] i965: Fix ARB_indirect_parameters logic.

2017-10-25 Thread Kenneth Graunke
On Wednesday, October 25, 2017 9:54:46 AM PDT Plamena Manolova wrote:
> This patch modifies the ARB_indirect_parameters logic in
> brw_draw_prims, so that our implementation isn't affected if
> another application attempts to use predicates. Previously we
> were using a predicate with a DELTAS_EQUAL comparison operation
> and relying on the MI_PREDICATE_DATA register being 0.

It's unfortunately a bit nastier of an explanation.  How about:

  Our code to initialize MI_PREDICATE_DATA to 0 was incorrect, so we
  were accidentally using whatever value was written there.  Because
  the kernel does not initialize the MI_PREDICATE_DATA register on
  hardware context creation, we might inherit the value from whatever
  context was last running on the GPU (likely another process).

  The Haswell command parser also does not currently allow us to write
  the MI_PREDICATE_DATA register.  Rather than fixing this and requiring
  an updated kernel, we switch to a different approach which uses two
  predicates (one using SRC_EQUAL and the other DELTAS_EQUAL) that makes
  no assumption about the states of any of the predicate registers.

> This
> assumption is incorrect when another program is using predicates
> which store data in MI_PREDICATE_DATA. This patch introduces a
> different approach which uses 2 predicates (one using SRC_EQUAL
> and the other DELTAS_EQUAL) that makes no assumption about the
> states of any of the predicate registers.
> 
> Fixes: piglit.spec.arb_indirect_parameters.tf-count-arrays
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103085
> 
> Signed-off-by: Plamena Manolova 
> CC: Kenneth Graunke 
> ---
>  src/mesa/drivers/dri/i965/brw_draw.c | 75 
> +---
>  1 file changed, 44 insertions(+), 31 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c 
> b/src/mesa/drivers/dri/i965/brw_draw.c
> index 80d4891f6f..81465c79fb 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -866,7 +866,6 @@ brw_draw_prims(struct gl_context *ctx,
> struct brw_context *brw = brw_context(ctx);
> const struct gl_vertex_array **arrays = ctx->Array._DrawArrays;
> int predicate_state = brw->predicate.state;
> -   int combine_op = MI_PREDICATE_COMBINEOP_SET;
> struct brw_transform_feedback_object *xfb_obj =
>(struct brw_transform_feedback_object *) gl_xfb_obj;
>  
> @@ -910,49 +909,63 @@ brw_draw_prims(struct gl_context *ctx,
>  * to it.
>  */
>  
> -if (brw->draw.draw_params_count_bo &&
> -predicate_state == BRW_PREDICATE_STATE_USE_BIT) {
> -  /* We need to empty the MI_PREDICATE_DATA register since it might
> -   * already be set.
> -   */
> -
> -  BEGIN_BATCH(4);
> -  OUT_BATCH(MI_PREDICATE_DATA);
> -  OUT_BATCH(0u);
> -  OUT_BATCH(MI_PREDICATE_DATA + 4);
> -  OUT_BATCH(0u);
> -  ADVANCE_BATCH();
> -
> -  /* We need to combine the results of both predicates.*/
> -  combine_op = MI_PREDICATE_COMBINEOP_AND;
> -   }
> -
> for (i = 0; i < nr_prims; i++) {
>/* Implementation of ARB_indirect_parameters via predicates */
>if (brw->draw.draw_params_count_bo) {
> - struct brw_bo *draw_id_bo = NULL;
> - uint32_t draw_id_offset;
> -
> - intel_upload_data(brw, [i].draw_id, 4, 4, _id_bo,
> -   _id_offset);
> -
>   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
>  
> + /*

Extra line here, should be /* Upload the ... (also below).

> +  * Upload the current draw count from the draw parameters buffer to
> +  * MI_PREDICATE_SRC0
> +  */
>   brw_load_register_mem(brw, MI_PREDICATE_SRC0,
> brw->draw.draw_params_count_bo,
> brw->draw.draw_params_count_offset);
> - brw_load_register_mem(brw, MI_PREDICATE_SRC1, draw_id_bo,
> -   draw_id_offset);
> -
> + /*
> +  * Upload the id of the current primitive to MI_PREDICATE_SRC1.
> +  */
> + brw_load_register_imm64(brw, MI_PREDICATE_SRC1, prims[i].draw_id);
> +
> + /*
> +  * This calculates MI_PREDICATE_SRC0 - MI_PREDICATE_SRC1 and stores 
> it
> +  * in MI_PREDICATE_DATA without modifying the Predicate State Bit 
> since
> +  * we don't need the comparision result (it's
> +  * MI_PREDICATE_SRC0 == MI_PREDICATE_SRC0).

MI_PREDICATE_SRC0 == MI_PREDICATE_SRC1, right?

> +  */
>   BEGIN_BATCH(1);
>   OUT_BATCH(GEN7_MI_PREDICATE |
> -   MI_PREDICATE_LOADOP_LOADINV | combine_op |
> -   MI_PREDICATE_COMPAREOP_DELTAS_EQUAL);
> +   MI_PREDICATE_LOADOP_KEEP | MI_PREDICATE_COMBINEOP_SET |
> +   MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
>   ADVANCE_BATCH();
>  
> - 

[Mesa-dev] [Bug 103388] Linking libcltgsi.la (llvm/codegen/libclllvm_la-common.lo) fails with "error: no match for 'operator-'" with GCC-7, Mesa from Git and current LLVM revisions

2017-10-25 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=103388

Jan Vesely  changed:

   What|Removed |Added

 Resolution|--- |FIXED
 Status|NEW |RESOLVED

--- Comment #5 from Jan Vesely  ---
Fixed by a6d38f476beaaf0a9677cfc168172121b5779570

clover: Fix compilation after clang r315871

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] mesa: enable ARB_texture_buffer_* extensions in the Compatibility profile

2017-10-25 Thread Dylan Baker
I've pushed it to our CI, I'll let you know tomorrow what it looks like

Dylan

Quoting Marek Olšák (2017-10-25 19:52:43)
> Hi Dylan,
> 
> Can you please add this and re-test?
> 
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/
> dri/i965/intel_extensions.c
> index 21cf632..4d17393 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -178,7 +178,7 @@ intelInitExtensions(struct gl_context *ctx)
>ctx->Extensions.ARB_pipeline_statistics_query = true;
>ctx->Extensions.ARB_sample_shading = true;
>ctx->Extensions.ARB_shading_language_420pack = true;
> -  if (ctx->API == API_OPENGL_CORE) {
> +  if (ctx->API != API_OPENGL_COMPAT) {
>   ctx->Extensions.ARB_texture_buffer_object = true;
>   ctx->Extensions.ARB_texture_buffer_object_rgb32 = true;
>   ctx->Extensions.ARB_texture_buffer_range = true;
> 
> 
> Marek
> 
> On Wed, Oct 25, 2017 at 11:42 PM, Dylan Baker  
> wrote:
> 
> There are a significant number of i965 regressions from
> d96c68146a781c79a23f5181d7050174f1070d90, largely related to texturing (I
> can
> send you a complete list of regressions if you care, but due to the large
> number
> of them I suspect it's something fairly simple).
> 
> For example:
> ES31-CTS.functional.texture.format.buffer.r32ui_npot
> 
> glGetIntegerv() failed: glGetError() returned GL_INVALID_ENUM at
> gluContextInfo.cpp:229
> 
> dEQP-GLES31.functional.state_query.integer.texture_buffer_
> binding_getinteger:
> 
> glGetIntegerv(GL_TEXTURE_BUFFER_BINDING, 0x7ffee0c43834);
> // data = { -555819298 }
> glGetError();
> // GL_INVALID_ENUM returned
>  // ERROR: glGetIntegerv: glGetError() returned GL_INVALID_ENUM
>
> Dylan
> 
> Quoting Emil Velikov (2017-10-24 09:30:03)
> > Hi Marek,
> >
> > On 21 October 2017 at 13:54, Marek Olšák  wrote:
> > > From: Marek Olšák 
> > >
> > > We already have piglit tests testing alpha, luminance, and intensity
> > > formats. They were skipped by piglit until now.
> > >
> > > Additionally, I'm enabling one ARB_texture_buffer_range piglit test to
> run
> > > with the compat profile.
> >
> > Can you please mention that ARB_texture_buffer_* on i965 is unchanged
> > - aka still enabled only for core profiles.
> > Out of curiosity - have you tried the series with anything more than
> piglit?
> >
> > The Intel guys can run the lot through CTS, dEQP... admittedly only on
> > Intel hardware.
> > Still it should help catch if a piece is missing somewhere.
> >
> >
> > -Emil
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
> 


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] mesa: enable ARB_texture_buffer_* extensions in the Compatibility profile

2017-10-25 Thread Marek Olšák
Hi Dylan,

Can you please add this and re-test?

diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c
b/src/mesa/drivers/dri/i965/intel_extensions.c
index 21cf632..4d17393 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -178,7 +178,7 @@ intelInitExtensions(struct gl_context *ctx)
   ctx->Extensions.ARB_pipeline_statistics_query = true;
   ctx->Extensions.ARB_sample_shading = true;
   ctx->Extensions.ARB_shading_language_420pack = true;
-  if (ctx->API == API_OPENGL_CORE) {
+  if (ctx->API != API_OPENGL_COMPAT) {
  ctx->Extensions.ARB_texture_buffer_object = true;
  ctx->Extensions.ARB_texture_buffer_object_rgb32 = true;
  ctx->Extensions.ARB_texture_buffer_range = true;


Marek

On Wed, Oct 25, 2017 at 11:42 PM, Dylan Baker 
wrote:

> There are a significant number of i965 regressions from
> d96c68146a781c79a23f5181d7050174f1070d90, largely related to texturing (I
> can
> send you a complete list of regressions if you care, but due to the large
> number
> of them I suspect it's something fairly simple).
>
> For example:
> ES31-CTS.functional.texture.format.buffer.r32ui_npot
>
> glGetIntegerv() failed: glGetError() returned GL_INVALID_ENUM at
> gluContextInfo.cpp:229
>
> dEQP-GLES31.functional.state_query.integer.texture_buffer_
> binding_getinteger:
>
> glGetIntegerv(GL_TEXTURE_BUFFER_BINDING, 0x7ffee0c43834);
> // data = { -555819298 }
> glGetError();
> // GL_INVALID_ENUM returned
>  // ERROR: glGetIntegerv: glGetError() returned GL_INVALID_ENUM
>
> Dylan
>
> Quoting Emil Velikov (2017-10-24 09:30:03)
> > Hi Marek,
> >
> > On 21 October 2017 at 13:54, Marek Olšák  wrote:
> > > From: Marek Olšák 
> > >
> > > We already have piglit tests testing alpha, luminance, and intensity
> > > formats. They were skipped by piglit until now.
> > >
> > > Additionally, I'm enabling one ARB_texture_buffer_range piglit test to
> run
> > > with the compat profile.
> >
> > Can you please mention that ARB_texture_buffer_* on i965 is unchanged
> > - aka still enabled only for core profiles.
> > Out of curiosity - have you tried the series with anything more than
> piglit?
> >
> > The Intel guys can run the lot through CTS, dEQP... admittedly only on
> > Intel hardware.
> > Still it should help catch if a piece is missing somewhere.
> >
> >
> > -Emil
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] radv: add cache items to in memory cache when reading from disk

2017-10-25 Thread Timothy Arceri
Otherwise we will leak them, load duplicates from disk rather
than memory and never write items loaded from disk to the apps
pipeline cache.

Fixes: fd24be134ffd 'radv: make use of on-disk cache'
---
 src/amd/vulkan/radv_pipeline_cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/vulkan/radv_pipeline_cache.c 
b/src/amd/vulkan/radv_pipeline_cache.c
index 9ba9a3b61b..89b27fd35f 100644
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -194,20 +194,22 @@ radv_create_shader_variants_from_pipeline_cache(struct 
radv_device *device,
 
uint8_t disk_sha1[20];
disk_cache_compute_key(device->physical_device->disk_cache,
   sha1, 20, disk_sha1);
entry = (struct cache_entry *)
disk_cache_get(device->physical_device->disk_cache,
   disk_sha1, NULL);
if (!entry) {
pthread_mutex_unlock(>mutex);
return false;
+   } else {
+   radv_pipeline_cache_add_entry(cache, entry);
}
}
 
char *p = entry->code;
for(int i = 0; i < MESA_SHADER_STAGES; ++i) {
if (!entry->variants[i] && entry->code_sizes[i]) {
struct radv_shader_variant *variant;
struct cache_entry_variant_info info;
 
variant = calloc(1, sizeof(struct radv_shader_variant));
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v3 07/10] egl: add dri2_surface_destroy_back_image() helper (v3)

2017-10-25 Thread Gurchetan Singh
No plans to merge dri2_surface_destroy_back_image
and dri2_surface_destroy_front_image ;-)?  Otherwise, looks consistent with
the previous review comments.  Patches 1-6 are:

Reviewed-by: Gurchetan Singh 

On Tue, Oct 24, 2017 at 2:45 PM, Gwan-gyeong Mun  wrote:

> To share common destroy dri_image_back code.
>
> In preparation to adding of new platform which uses this helper.
>
> v2:
>  - Move dri_image_back to outside of android ifdef block for removing of
>ifdef magic on dri2_egl_surface_destroy_image_back().
>  - Fixes from Eric's review:
>a) Split out series of refactor for helpers to a separate series.
>b) Add the new helper function and use them to replace the old code in
> the
>   same patch.
>
> v3: Fixes from Emil and Gurchetan's review
>   - Follow the naming convention which prevents too verbose name of
> functions.
> a) use a dri2_surface_$action_$object naming convention
> b) change a first argument type "struct dri2_egl_surface" to
> "_EGLSurface".
>
> Signed-off-by: Mun Gwan-gyeong 
> Reviewed-by: Emil Velikov 
> ---
>  src/egl/drivers/dri2/egl_dri2.c | 12 
>  src/egl/drivers/dri2/egl_dri2.h |  6 +-
>  src/egl/drivers/dri2/platform_android.c | 11 ++-
>  3 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_
> dri2.c
> index d381e52e86..dc2aecef88 100644
> --- a/src/egl/drivers/dri2/egl_dri2.c
> +++ b/src/egl/drivers/dri2/egl_dri2.c
> @@ -1142,6 +1142,18 @@ dri2_surface_update_age(_EGLSurface *surf)
>dri2_surf->back->age = 1;
>  }
>
> +void
> +dri2_surface_destroy_back_image(_EGLSurface *surf)
> +{
> +   struct dri2_egl_display *dri2_dpy = dri2_egl_display(surf->
> Resource.Display);
> +   struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
> +
> +   if (dri2_surf->dri_image_back) {
> +  dri2_dpy->image->destroyImage(dri2_surf->dri_image_back);
> +  dri2_surf->dri_image_back = NULL;
> +   }
> +}
> +
>  /**
>   * Called via eglTerminate(), drv->API.Terminate().
>   *
> diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_
> dri2.h
> index 58f8082509..f13bdb6d12 100644
> --- a/src/egl/drivers/dri2/egl_dri2.h
> +++ b/src/egl/drivers/dri2/egl_dri2.h
> @@ -303,10 +303,11 @@ struct dri2_egl_surface
>int age;
> } color_buffers[COLOR_BUFFERS_SIZE], *back, *current;
>
> +   __DRIimage *dri_image_back;
> +
>  #ifdef HAVE_ANDROID_PLATFORM
> struct ANativeWindow *window;
> struct ANativeWindowBuffer *buffer;
> -   __DRIimage *dri_image_back;
> __DRIimage *dri_image_front;
>  #endif
>
> @@ -460,6 +461,9 @@ dri2_surface_set_back_buffer(_EGLSurface *surf, void
> *buffer);
>  void
>  dri2_surface_update_age(_EGLSurface *surf);
>
> +void
> +dri2_surface_destroy_back_image(_EGLSurface *surf);
> +
>  EGLBoolean
>  dri2_init_surface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
>  _EGLConfig *conf, const EGLint *attrib_list, EGLBoolean
> enable_out_fence);
> diff --git a/src/egl/drivers/dri2/platform_android.c
> b/src/egl/drivers/dri2/platform_android.c
> index 45af871555..e0896ed1a0 100644
> --- a/src/egl/drivers/dri2/platform_android.c
> +++ b/src/egl/drivers/dri2/platform_android.c
> @@ -228,10 +228,7 @@ droid_window_enqueue_buffer(_EGLDisplay *disp,
> struct dri2_egl_surface *dri2_sur
>
> mtx_lock(>Mutex);
>
> -   if (dri2_surf->dri_image_back) {
> -  dri2_dpy->image->destroyImage(dri2_surf->dri_image_back);
> -  dri2_surf->dri_image_back = NULL;
> -   }
> +   dri2_surface_destroy_back_image(_surf->base);
>
> return EGL_TRUE;
>  }
> @@ -355,11 +352,7 @@ droid_destroy_surface(_EGLDriver *drv, _EGLDisplay
> *disp, _EGLSurface *surf)
>dri2_surf->window->common.decRef(_surf->window->common);
> }
>
> -   if (dri2_surf->dri_image_back) {
> -  _eglLog(_EGL_DEBUG, "%s : %d : destroy dri_image_back", __func__,
> __LINE__);
> -  dri2_dpy->image->destroyImage(dri2_surf->dri_image_back);
> -  dri2_surf->dri_image_back = NULL;
> -   }
> +   dri2_surface_destroy_back_image(surf);
>
> if (dri2_surf->dri_image_front) {
>_eglLog(_EGL_DEBUG, "%s : %d : destroy dri_image_front", __func__,
> __LINE__);
> --
> 2.14.2
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] etnaviv: make use of TEXTURE_TYPE_1D

2017-10-25 Thread Christian Gmeiner
Signed-off-by: Christian Gmeiner 
---
 src/gallium/drivers/etnaviv/etnaviv_texture.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/etnaviv/etnaviv_texture.c 
b/src/gallium/drivers/etnaviv/etnaviv_texture.c
index b8ebab6082..f71169d227 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_texture.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_texture.c
@@ -212,11 +212,8 @@ etna_create_sampler_view(struct pipe_context *pctx, struct 
pipe_resource *prsc,
 
switch (sv->base.target) {
case PIPE_TEXTURE_1D:
-  /* For 1D textures, we will have a height of 1, so we can use 2D
-   * but set T wrap to repeat */
-  sv->TE_SAMPLER_CONFIG0_MASK = ~VIVS_TE_SAMPLER_CONFIG0_VWRAP__MASK;
-  sv->TE_SAMPLER_CONFIG0 |= 
VIVS_TE_SAMPLER_CONFIG0_VWRAP(TEXTURE_WRAPMODE_REPEAT);
-  /* fallthrough */
+  sv->TE_SAMPLER_CONFIG0 |= VIVS_TE_SAMPLER_CONFIG0_TYPE(TEXTURE_TYPE_1D);
+  break;
case PIPE_TEXTURE_2D:
case PIPE_TEXTURE_RECT:
   sv->TE_SAMPLER_CONFIG0 |= VIVS_TE_SAMPLER_CONFIG0_TYPE(TEXTURE_TYPE_2D);
-- 
2.13.6

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] meson for remaining class drivers

2017-10-25 Thread Dylan Baker
ping

Quoting Dylan Baker (2017-10-16 17:55:49)
> Here is build support for the three remaining classic drivers, radeon (r100),
> r200, and the nouveau-vieux driver. None of these are too crazy, but I don't
> have hardware to test any of these.
> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] clover: Fix compilation after clang r315871

2017-10-25 Thread Francisco Jerez
Jan Vesely  writes:

> On Tue, 2017-10-24 at 15:32 +0200, Vedran Miletić wrote:
>> On 10/23/2017 05:24 AM, Jan Vesely wrote:
>> > From: Jan Vesely 
>> > 
>> > v2: use a more generic compat function
>> > 
>> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103388
>> > Signed-off-by: Jan Vesely 
>> > ---
>> >  src/gallium/state_trackers/clover/llvm/codegen/common.cpp |  5 ++---
>> >  src/gallium/state_trackers/clover/llvm/compat.hpp | 12 
>> > ++--
>> >  2 files changed, 12 insertions(+), 5 deletions(-)
>> > 
>> > diff --git a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp 
>> > b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
>> > index 075183400a..dd9d02ab11 100644
>> > --- a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
>> > +++ b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
>> > @@ -70,7 +70,6 @@ namespace {
>> > make_kernel_args(const Module , const std::string _name,
>> >  const clang::CompilerInstance ) {
>> >std::vector args;
>> > -  const auto address_spaces = c.getTarget().getAddressSpaceMap();
>> >const Function  = *mod.getFunction(kernel_name);
>> >::llvm::DataLayout dl();
>> >const auto size_type =
>> > @@ -128,8 +127,8 @@ namespace {
>> > const unsigned address_space =
>> >cast< 
>> > ::llvm::PointerType>(actual_type)->getAddressSpace();
>> >  
>> > -   if (address_space == 
>> > address_spaces[clang::LangAS::opencl_local
>> > -   - 
>> > compat::lang_as_offset]) {
>> > +   if (address_space == compat::target_lang_address_space(
>> > +  c.getTarget(), 
>> > clang::LangAS::opencl_local)) {
>> >args.emplace_back(module::argument::local, arg_api_size,
>> >  target_size, target_align,
>> >  module::argument::zero_ext);
>> > diff --git a/src/gallium/state_trackers/clover/llvm/compat.hpp 
>> > b/src/gallium/state_trackers/clover/llvm/compat.hpp
>> > index f8b56516d5..3e34f0dd94 100644
>> > --- a/src/gallium/state_trackers/clover/llvm/compat.hpp
>> > +++ b/src/gallium/state_trackers/clover/llvm/compat.hpp
>> > @@ -69,11 +69,19 @@ namespace clover {
>> >   typedef ::llvm::TargetLibraryInfo target_library_info;
>> >  #endif
>> >  
>> > + template
>> > + unsigned target_lang_address_space(const T& target, const AS 
>> > lang_as) {
>> > +const auto  = target.getAddressSpaceMap();
>> > +#if HAVE_LLVM >= 0x0500
>> > +return map[static_cast(lang_as)];
>> > +#else
>> > +return map[lang_as - clang::LangAS::Offset];
>> > +#endif
>> > + }
>> > +
>> >  #if HAVE_LLVM >= 0x0500
>> > - const auto lang_as_offset = 0;
>> >   const clang::InputKind ik_opencl = clang::InputKind::OpenCL;
>> >  #else
>> > - const auto lang_as_offset = clang::LangAS::Offset;
>> >   const clang::InputKind ik_opencl = clang::IK_OpenCL;
>> >  #endif
>> >  
>> > 
>> 
>> Thanks for improving the patch. Future-proof thinking: what if the value
>> of clang::LangAS::Default changes from 0 to some other constant?
>
> Hi Vedran,
>
> you're right that it'd be more future proof, but I liked the one line
> simplicity of the current version. Future clang changes will require
> adaptations, but I don't expect clang to go back to non-0 lang AS
> indices. Feel free to add "I told you so" if they prove me wrong :)
>

I think it will be trivial to extend Jan's current approach to non-zero
LangAS offset...  Just add a new #elif directive to
compat::target_lang_address_space() in the unlikely case that it's necessary
in the future.

>> 
>> Other than that, this patch is:
>> 
>> Reviewed-by: Vedran Miletić 
>
> I was not sure if this applied even without the change so I pushed it
> only with francisco's rb.
>
> thanks,
> Jan
>
>> 
>> Regards,
>> Vedran
>> 
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Build mesa-dev on Windows with AVX instruction set problem

2017-10-25 Thread Fabrício Ceolin
Looking at the source code in the disassembly and the stack trace, it appears
to me that the problem is in std::basic_string(), not in mesa-dev itself.

However, it seems strange to me that the source of this include is in the MSVC
directory (according to VS2017) while opengl32.dll was compiled using LLVM.
Maybe the problem is in the LLVM compilation, not the mesa-dev compilation.

Disassembly with source:

  1922: basic_string(const basic_string& _Right)
  1923: :
_Mybase(_Alty_traits::select_on_container_copy_construction(_Right._Getal()))
7FF85FBF874B 48 89 41 18  mov qword ptr [rcx+18h],rax
  1925: _Construct_lv_contents(_Right);
7FF85FBF874F 48 83 7A 18 10   cmp qword ptr [rdx+18h],10h
7FF85FBF8754 48 8B 6A 10  mov rbp,qword ptr [rdx+10h]
7FF85FBF8758 72 03jb
 std::basic_string::basic_string+2Dh
(07FF85FBF875Dh)
7FF85FBF875A 48 8B 32 mov rsi,qword ptr [rdx]
7FF85FBF875D 48 83 FD 10  cmp rbp,10h
7FF85FBF8761 73 27jae
std::basic_string::basic_string+5Ah
(07FF85FBF878Ah)
7FF85FBF8763 C5 F8 10 06  vmovups xmm0,xmmword ptr [rsi]
7FF85FBF8767 C5 F8 11 01  vmovups xmmword ptr [rcx],xmm0
7FF85FBF876B 48 89 69 10  mov qword ptr [rcx+10h],rbp


Include path of source in disassembly: c:\Program Files (x86)\Microsoft
Visual Studio\2017\Community\VC\Tools\MSVC\14.11.25503\include\xstring


Stacktrace:

>
opengl32.dll!std::basic_string::basic_string(const
std::basic_string &
_Right) Line 1925 C++ Symbols loaded.
  [External Code] Annotated Frame
  opengl32.dll!_initterm(void(*)() * first, void(*)() * last) Line 16 C++
Symbols loaded.
  [External Code] Annotated Frame
  python36.dll!5e137fd8() Unknown No symbols loaded.
  python36.dll!5e1376ca() Unknown No symbols loaded.
  python36.dll!5e1372a1() Unknown No symbols loaded.
  python36.dll!5e137217() Unknown No symbols loaded.

[image: MiningMath Associates]

*Fabrício
Ceolin*
+55 (31) 98675-1359
MiningMath Associates 
www.miningmath.com 



2017-10-25 20:55 GMT-02:00 Roland Scheidegger :

> Am 26.10.2017 um 00:26 schrieb Ilia Mirkin:
> > On Wed, Oct 25, 2017 at 6:15 PM, Fabrício Ceolin
> > >
> > wrote:
> >
> > Hi,
> >
> > Thanks. I recompiled everything (on Windows) using this real machine:
> >
> > #under msys2
> > $ cat /proc/cpuinfo
> > processor   : 0
> > vendor_id   : GenuineIntel
> > cpu family  : 6
> > model   : 23
> > model name  : Genuine Intel(R) CPU   U2300  @ 1.20GHz
> > stepping: 10
> > cpu MHz : 1197.000
> > cache size  : 1024 KB
> > physical id : 0
> > siblings: 2
> > core id : 0
> > cpu cores   : 2
> > apicid  : 0
> > initial apicid  : 0
> > fpu : yes
> > fpu_exception   : yes
> > cpuid level : 13
> > wp  : yes
> > flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
> > pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm
> > pbe pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm xsave
> > osxsave lahf_lm dtherm
> > clflush size: 64
> > cache_alignment : 64
> > address sizes   : 36 bits physical, 48 bits virtual
> > power management:
> >
> > but I got the same error:
> >
> > 7FF85FBF874F 48 83 7A 18 10   cmp qword ptr
> > [rdx+18h],10h
> > 7FF85FBF8754 48 8B 6A 10  mov rbp,qword ptr
> > [rdx+10h]
> > 7FF85FBF8758 72 03jb
> > std::basic_string > >::basic_string > >+2Dh (07FF85FBF875Dh)
> > 7FF85FBF875A 48 8B 32 mov rsi,qword ptr
> [rdx]
> > 7FF85FBF875D 48 83 FD 10  cmp rbp,10h
> > 7FF85FBF8761 73 27jae
> >  std::basic_string > >::basic_string > >+5Ah (07FF85FBF878Ah)
> > *7FF85FBF8763 C5 F8 10 06  vmovups xmm0,xmmword ptr
> > [rsi]  *
> > 7FF85FBF8767 C5 F8 11 01  vmovups xmmword ptr
> > [rcx],xmm0
> > 7FF85FBF876B 48 89 69 10  mov qword ptr
> > 

[Mesa-dev] [PATCH v3 47/48] nir: Validate base types on array dereferences

2017-10-25 Thread Jason Ekstrand
We were already validating that the parent type goes along with the
child type but we weren't actually validating that the parent type is
reasonable.  This fixes that.

Acked-by: Lionel Landwerlin 
---
 src/compiler/nir/nir_validate.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 2322c8f..9bf8c70 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -397,7 +397,8 @@ validate_alu_instr(nir_alu_instr *instr, validate_state 
*state)
 }
 
 static void
-validate_deref_chain(nir_deref *deref, validate_state *state)
+validate_deref_chain(nir_deref *deref, nir_variable_mode mode,
+ validate_state *state)
 {
validate_assert(state, deref->child == NULL || ralloc_parent(deref->child) 
== deref);
 
@@ -405,6 +406,19 @@ validate_deref_chain(nir_deref *deref, validate_state 
*state)
while (deref != NULL) {
   switch (deref->deref_type) {
   case nir_deref_type_array:
+ if (mode == nir_var_shared) {
+/* Shared variables have a bit more relaxed rules because we need
+ * to be able to handle array derefs on vectors.  Fortunately,
+ * nir_lower_io handles these just fine.
+ */
+validate_assert(state, glsl_type_is_array(parent->type) ||
+   glsl_type_is_matrix(parent->type) ||
+   glsl_type_is_vector(parent->type));
+ } else {
+/* Most of NIR cannot handle array derefs on vectors */
+validate_assert(state, glsl_type_is_array(parent->type) ||
+   glsl_type_is_matrix(parent->type));
+ }
  validate_assert(state, deref->type == 
glsl_get_array_element(parent->type));
  if (nir_deref_as_array(deref)->deref_array_type ==
  nir_deref_array_type_indirect)
@@ -451,7 +465,7 @@ validate_deref_var(void *parent_mem_ctx, nir_deref_var 
*deref, validate_state *s
 
validate_var_use(deref->var, state);
 
-   validate_deref_chain(>deref, state);
+   validate_deref_chain(>deref, deref->var->data.mode, state);
 }
 
 static void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 32/48] intel/cs: Push subgroup ID instead of base thread ID

2017-10-25 Thread Jason Ekstrand
We're going to want subgroup ID for SPIR-V subgroups eventually anyway.
We really only want to push one and calculate the other from it.  It
makes a bit more sense to push the subgroup ID because it's simpler to
calculate and because it's a real API thing.  The only advantage to
pushing the base thread ID is to avoid a single SHL in the shader.
---
 src/compiler/nir/nir_intrinsics.h|  4 +---
 src/intel/compiler/brw_compiler.h|  2 +-
 src/intel/compiler/brw_fs.cpp| 30 
 src/intel/compiler/brw_fs.h  |  2 +-
 src/intel/compiler/brw_fs_nir.cpp|  8 +++
 src/intel/compiler/brw_nir.h |  3 ++-
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 15 
 src/intel/vulkan/anv_cmd_buffer.c|  6 ++---
 src/mesa/drivers/dri/i965/gen6_constant_state.c  |  6 ++---
 9 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index 47022dd..bb8cfac 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -355,6 +355,7 @@ SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_id, 1, 0, xx, xx, xx)
 
 /* Blend constant color values.  Float values are clamped. */
 SYSTEM_VALUE(blend_const_color_r_float, 1, 0, xx, xx, xx)
@@ -364,9 +365,6 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color_rgba_unorm, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color__unorm, 1, 0, xx, xx, xx)
 
-/* Intel specific system values */
-SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx)
-
 /**
  * Barycentric coordinate intrinsics.
  *
diff --git a/src/intel/compiler/brw_compiler.h 
b/src/intel/compiler/brw_compiler.h
index 508d4ba..23c2172 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -543,7 +543,7 @@ enum brw_param_builtin {
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,
 
-   BRW_PARAM_BUILTIN_THREAD_LOCAL_ID,
+   BRW_PARAM_BUILTIN_SUBGROUP_ID,
 };
 
 #define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c054537..6e2428c 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -996,7 +996,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
-   this->thread_local_id = v->thread_local_id;
+   this->subgroup_id = v->subgroup_id;
 }
 
 void
@@ -1931,14 +1931,14 @@ set_push_pull_constant_loc(unsigned uniform, int 
*chunk_start,
 }
 
 static int
-get_thread_local_id_param_index(const brw_stage_prog_data *prog_data)
+get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
 {
if (prog_data->nr_params == 0)
   return -1;
 
/* The local thread id is always the last parameter in the list */
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
-   if (last_param == BRW_PARAM_BUILTIN_THREAD_LOCAL_ID)
+   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
   return prog_data->nr_params - 1;
 
return -1;
@@ -2019,7 +2019,7 @@ fs_visitor::assign_constant_locations()
   }
}
 
-   int thread_local_id_index = 
get_thread_local_id_param_index(stage_prog_data);
+   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
 
/* Only allow 16 registers (128 uniform components) as push constants.
 *
@@ -2030,7 +2030,7 @@ fs_visitor::assign_constant_locations()
 * brw_curbe.c.
 */
unsigned int max_push_components = 16 * 8;
-   if (thread_local_id_index >= 0)
+   if (subgroup_id_index >= 0)
   max_push_components--; /* Save a slot for the thread ID */
 
/* We push small arrays, but no bigger than 16 floats.  This is big enough
@@ -2075,8 +2075,8 @@ fs_visitor::assign_constant_locations()
   if (!is_live[u])
  continue;
 
-  /* Skip thread_local_id_index to put it in the last push register. */
-  if (thread_local_id_index == (int)u)
+  /* Skip subgroup_id_index to put it in the last push register. */
+  if (subgroup_id_index == (int)u)
  continue;
 
   set_push_pull_constant_loc(u, _start, _chunk_bitsize,
@@ -2090,8 +2090,8 @@ fs_visitor::assign_constant_locations()
}
 
/* Add the CS local thread ID uniform at the end of the push constants */
-   if (thread_local_id_index >= 0)
-  push_constant_loc[thread_local_id_index] = num_push_constants++;
+   if (subgroup_id_index >= 0)
+  push_constant_loc[subgroup_id_index] = num_push_constants++;
 
/* As the uniforms are going to be reordered, stash the old array and
 * create two 

[Mesa-dev] [PATCH v3 43/48] nir/lower_subgroups: Lower ballot intrinsics to the specified bit size

2017-10-25 Thread Jason Ekstrand
Ballot intrinsics return a bitfield of subgroups.  In GLSL and some
SPIR-V extensions, they return a uint64_t.  In SPV_KHR_shader_ballot,
they return a uvec4.  Also, some back-ends would rather pass around
32-bit values because it's easier than messing with 64-bit all the time.
To solve this mess, we make nir_lower_subgroups take a new parameter
called ballot_bit_size and it lowers whichever thing it gets in from the
source language (uint64_t or uvec4) to a scalar with the specified
number of bits.  This replaces a chunk of the old lowering code.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/nir/nir.h |   3 +-
 src/compiler/nir/nir_lower_subgroups.c | 101 +++--
 src/compiler/nir/nir_opt_intrinsics.c  |  18 --
 src/intel/compiler/brw_compiler.c  |   1 -
 src/intel/compiler/brw_nir.c   |   1 +
 5 files changed, 98 insertions(+), 26 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1a25d7b..563b57f 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1854,8 +1854,6 @@ typedef struct nir_shader_compiler_options {
 */
bool use_interpolated_input_intrinsics;
 
-   unsigned max_subgroup_size;
-
unsigned max_unroll_iterations;
 } nir_shader_compiler_options;
 
@@ -2469,6 +2467,7 @@ bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
 typedef struct nir_lower_subgroups_options {
+   uint8_t ballot_bit_size;
bool lower_to_scalar:1;
bool lower_vote_trivial:1;
bool lower_subgroup_masks:1;
diff --git a/src/compiler/nir/nir_lower_subgroups.c 
b/src/compiler/nir/nir_lower_subgroups.c
index 02738c4..1969740 100644
--- a/src/compiler/nir/nir_lower_subgroups.c
+++ b/src/compiler/nir/nir_lower_subgroups.c
@@ -28,6 +28,43 @@
  * \file nir_opt_intrinsics.c
  */
 
+/* Converts a uint32_t or uint64_t value to uint64_t or uvec4 */
+static nir_ssa_def *
+uint_to_ballot_type(nir_builder *b, nir_ssa_def *value,
+unsigned num_components, unsigned bit_size,
+uint32_t extend_val)
+{
+   assert(value->num_components == 1);
+   assert(value->bit_size == 32 || value->bit_size == 64);
+
+   nir_ssa_def *extend = nir_imm_int(b, extend_val);
+   if (num_components > 1) {
+  /* SPIR-V uses a uvec4 for ballot values */
+  assert(num_components == 4);
+  assert(bit_size == 32);
+
+  if (value->bit_size == 32) {
+ return nir_vec4(b, value, extend, extend, extend);
+  } else {
+ assert(value->bit_size == 64);
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, value),
+nir_unpack_64_2x32_split_y(b, value),
+extend, extend);
+  }
+   } else {
+  /* GLSL uses a uint64_t for ballot values */
+  assert(num_components == 1);
+  assert(bit_size == 64);
+
+  if (value->bit_size == 32) {
+ return nir_pack_64_2x32_split(b, value, extend);
+  } else {
+ assert(value->bit_size == 64);
+ return value;
+  }
+   }
+}
+
 static nir_ssa_def *
 lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
 {
@@ -86,24 +123,78 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr 
*intrin,
   if (!options->lower_subgroup_masks)
  return NULL;
 
+  uint64_t mask;
+  switch (intrin->intrinsic) {
+  case nir_intrinsic_load_subgroup_eq_mask:
+ mask = 1ull;
+ break;
+  case nir_intrinsic_load_subgroup_ge_mask:
+  case nir_intrinsic_load_subgroup_lt_mask:
+ mask = ~0ull;
+ break;
+  case nir_intrinsic_load_subgroup_gt_mask:
+  case nir_intrinsic_load_subgroup_le_mask:
+ mask = ~1ull;
+ break;
+  default:
+ unreachable("you seriously can't tell this is unreachable?");
+  }
+
   nir_ssa_def *count = nir_load_subgroup_invocation(b);
+  nir_ssa_def *shifted;
+  if (options->ballot_bit_size == 32 && intrin->dest.ssa.bit_size == 32) {
+ assert(intrin->dest.ssa.num_components == 4);
+ shifted = nir_ishl(b, nir_imm_int(b, mask), count);
+  } else {
+ /* We're either working with 64-bit types natively or we're in OpenGL
+  * where we want a uint64_t as our final value.  In either case we
+  * know that we have 64-bit types.  In the first case, we need to use
+  * 64 bits because of the native subgroup size.  In the second, we
+  * want a 64-bit result and a 64-bit shift is likely more efficient
+  * than messing around with 32-bit shifts and packing.
+  */
+ assert(options->ballot_bit_size == 64 ||
+intrin->dest.ssa.bit_size == 64);
+ shifted = nir_ishl(b, nir_imm_int64(b, mask), count);
+  }
+
+  nir_ssa_def *ballot =
+ uint_to_ballot_type(b, shifted,
+ 

[Mesa-dev] [PATCH v3 44/48] nir, intel/compiler: Use a fixed subgroup size

2017-10-25 Thread Jason Ekstrand
The GL_ARB_shader_ballot spec says that gl_SubGroupSizeARB is declared
as a uniform.  This means that it cannot change across an invocation
such as a draw call or a compute dispatch.  For compute shaders, we're
ok because we only ever use one dispatch size.  For fragment, however,
the hardware dynamically chooses between SIMD8 and SIMD16 which violates
the spec.  Instead, let's just pick a subgroup size based on the shader
stage.  The fixed size we choose for compute shaders is a bit higher
than strictly needed but there's no real harm in that.  The advantage is
that, if they do anything interesting with the value, NIR will see it as
an immediate and can optimize better.

Acked-by: Lionel Landwerlin 
---
 src/compiler/nir/nir.h | 1 +
 src/compiler/nir/nir_lower_subgroups.c | 5 +
 src/intel/compiler/brw_fs_nir.cpp  | 4 
 src/intel/compiler/brw_nir.c   | 2 ++
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 563b57f..df18bfa 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2467,6 +2467,7 @@ bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
 typedef struct nir_lower_subgroups_options {
+   uint8_t subgroup_size;
uint8_t ballot_bit_size;
bool lower_to_scalar:1;
bool lower_vote_trivial:1;
diff --git a/src/compiler/nir/nir_lower_subgroups.c 
b/src/compiler/nir/nir_lower_subgroups.c
index 1969740..f9424c9 100644
--- a/src/compiler/nir/nir_lower_subgroups.c
+++ b/src/compiler/nir/nir_lower_subgroups.c
@@ -109,6 +109,11 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr 
*intrin,
  return nir_imm_int(b, NIR_TRUE);
   break;
 
+   case nir_intrinsic_load_subgroup_size:
+  if (options->subgroup_size)
+ return nir_imm_int(b, options->subgroup_size);
+  break;
+
case nir_intrinsic_read_invocation:
case nir_intrinsic_read_first_invocation:
   if (options->lower_to_scalar)
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index b552387..47730e1 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4183,10 +4183,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   break;
}
 
-   case nir_intrinsic_load_subgroup_size:
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
-  break;
-
case nir_intrinsic_load_subgroup_invocation:
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
   nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 0d59d36..5ed36fe 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -637,6 +637,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, 
nir_shader *nir)
OPT(nir_lower_system_values);
 
const nir_lower_subgroups_options subgroups_options = {
+  .subgroup_size = nir->info.stage == MESA_SHADER_COMPUTE ? 32 :
+   nir->info.stage == MESA_SHADER_FRAGMENT ? 16 : 8,
   .ballot_bit_size = 32,
   .lower_to_scalar = true,
   .lower_subgroup_masks = true,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 42/48] nir/lower_system_values: Lower SUBGROUP_*_MASK based on type

2017-10-25 Thread Jason Ekstrand
The SUBGROUP_*_MASK system values are uint64_t when coming in from GLSL
but uvec4 when coming in from SPIR-V.  Lowering based on type allows us
to nicely handle both.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/nir/nir_lower_system_values.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_lower_system_values.c 
b/src/compiler/nir/nir_lower_system_values.c
index 48c497e..3594f4a 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -125,8 +125,9 @@ convert_block(nir_block *block, nir_builder *b)
  nir_intrinsic_op op =
 nir_intrinsic_from_system_value(var->data.location);
  nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
- nir_ssa_dest_init(>instr, >dest, 1, 64, NULL);
- load->num_components = 1;
+ nir_ssa_dest_init_for_type(>instr, >dest,
+var->type, NULL);
+ load->num_components = load->dest.ssa.num_components;
  nir_builder_instr_insert(b, >instr);
  sysval = >dest.ssa;
  break;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 25/48] intel/cs: Drop max_dispatch_width checks from compile_cs

2017-10-25 Thread Jason Ekstrand
The only things that adjust fs_visitor::max_dispatch_width are render
target writes, which don't happen in compute shaders, so these checks
are pointless.
---
 src/intel/compiler/brw_fs.cpp | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index a23366b..4c362ba 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6818,8 +6818,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  NULL, /* Never used in core profile */
  shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
-   !fail_msg && v8.max_dispatch_width >= 16 &&
-   min_dispatch_width <= 16) {
+   !fail_msg && min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
   if (min_dispatch_width <= 8)
  v16.import_uniforms();
@@ -6843,8 +6842,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
fs_visitor v32(compiler, log_data, mem_ctx, key, _data->base,
  NULL, /* Never used in core profile */
  shader, 32, shader_time_index);
-   if (!fail_msg && v8.max_dispatch_width >= 32 &&
-   (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+   if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   /* Try a SIMD32 compile */
   if (min_dispatch_width <= 8)
  v32.import_uniforms();
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 40/48] nir: Add a ssa_dest_init_for_type helper

2017-10-25 Thread Jason Ekstrand
This would be useful in a number of places.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/nir/nir.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 8c3a20c..1a25d7b 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2228,6 +2228,15 @@ void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
 void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
   unsigned num_components, unsigned bit_size,
   const char *name);
+static inline void
+nir_ssa_dest_init_for_type(nir_instr *instr, nir_dest *dest,
+   const struct glsl_type *type,
+   const char *name)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+   nir_ssa_dest_init(instr, dest, glsl_get_components(type),
+ glsl_get_bit_size(type), name);
+}
 void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
 nir_instr *after_me);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 48/48] compiler/nir_types: Handle vectors in glsl_get_array_element

2017-10-25 Thread Jason Ekstrand
Most of NIR doesn't allow doing array indexing on a vector (though it
does on a matrix).  However, nir_lower_io handles it just fine and this
behavior is needed for shared variables in Vulkan.  This commit makes
glsl_get_array_element do something sensible for vector types and makes
nir_validate happy with them.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/nir_types.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index b1b17eb..c66cfff 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -39,6 +39,8 @@ glsl_get_array_element(const glsl_type* type)
 {
if (type->is_matrix())
   return type->column_type();
+   else if (type->is_vector())
+  return type->get_scalar_type();
return type->fields.array;
 }
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 41/48] nir: Make ballot intrinsics variable-size

2017-10-25 Thread Jason Ekstrand
This way they can return either a uvec4 or a uint64_t.  At the moment,
this is a no-op since we still always return a uint64_t.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/glsl/glsl_to_nir.cpp  |  1 +
 src/compiler/nir/nir_intrinsics.h  | 12 ++--
 src/compiler/nir/nir_lower_system_values.c |  1 +
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/compiler/glsl/glsl_to_nir.cpp 
b/src/compiler/glsl/glsl_to_nir.cpp
index c659a25..2786ede 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -1165,6 +1165,7 @@ nir_visitor::visit(ir_call *ir)
   case nir_intrinsic_ballot: {
  nir_ssa_dest_init(>instr, >dest,
ir->return_deref->type->vector_elements, 64, NULL);
+ instr->num_components = ir->return_deref->type->vector_elements;
 
  ir_rvalue *value = (ir_rvalue *) ir->actual_parameters.get_head();
  instr->src[0] = nir_src_for_ssa(evaluate_rvalue(value));
diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index bb8cfac..20bef33 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -102,7 +102,7 @@ INTRINSIC(shader_clock, 0, ARR(0), true, 2, 0, 0, xx, xx, 
xx, NIR_INTRINSIC_CAN_
  *
  * GLSL functions from ARB_shader_ballot.
  */
-INTRINSIC(ballot, 1, ARR(1), true, 1, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(ballot, 1, ARR(1), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 INTRINSIC(read_invocation, 2, ARR(0, 1), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 INTRINSIC(read_first_invocation, 1, ARR(0), true, 0, 0, 0, xx, xx, xx, 
NIR_INTRINSIC_CAN_ELIMINATE)
 
@@ -350,11 +350,11 @@ SYSTEM_VALUE(layer_id, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(view_index, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_size, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_invocation, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_eq_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_ge_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_gt_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_le_mask, 1, 0, xx, xx, xx)
-SYSTEM_VALUE(subgroup_lt_mask, 1, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_eq_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_ge_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_gt_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_le_mask, 0, 0, xx, xx, xx)
+SYSTEM_VALUE(subgroup_lt_mask, 0, 0, xx, xx, xx)
 SYSTEM_VALUE(subgroup_id, 1, 0, xx, xx, xx)
 
 /* Blend constant color values.  Float values are clamped. */
diff --git a/src/compiler/nir/nir_lower_system_values.c 
b/src/compiler/nir/nir_lower_system_values.c
index 39b1a26..48c497e 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -126,6 +126,7 @@ convert_block(nir_block *block, nir_builder *b)
 nir_intrinsic_from_system_value(var->data.location);
  nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, op);
  nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
+ load->num_components = 1;
  nir_builder_instr_insert(b, &load->instr);
  sysval = &load->dest.ssa;
  break;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 46/48] spirv: Rework barriers

2017-10-25 Thread Jason Ekstrand
Our previous handling of barriers always used the big hammer and didn't
correctly emit memory barriers when specified along with a control
barrier.  This commit completely reworks the way we emit barriers to
make things both more precise and more correct.

Reviewed-by: Lionel Landwerlin 
---
 src/compiler/spirv/spirv_to_nir.c | 132 --
 1 file changed, 114 insertions(+), 18 deletions(-)

diff --git a/src/compiler/spirv/spirv_to_nir.c 
b/src/compiler/spirv/spirv_to_nir.c
index fe0a4ef..6051854 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -2571,36 +2571,132 @@ vtn_handle_composite(struct vtn_builder *b, SpvOp 
opcode,
 }
 
 static void
+vtn_emit_barrier(struct vtn_builder *b, nir_intrinsic_op op)
+{
+   nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
+   nir_builder_instr_insert(&b->nb, &intrin->instr);
+}
+
+static void
+vtn_emit_memory_barrier(struct vtn_builder *b, SpvScope scope,
+SpvMemorySemanticsMask semantics)
+{
+   static const SpvMemorySemanticsMask all_memory_semantics =
+  SpvMemorySemanticsUniformMemoryMask |
+  SpvMemorySemanticsWorkgroupMemoryMask |
+  SpvMemorySemanticsAtomicCounterMemoryMask |
+  SpvMemorySemanticsImageMemoryMask;
+
+   /* If we're not actually doing a memory barrier, bail */
+   if (!(semantics & all_memory_semantics))
+  return;
+
+   /* GL and Vulkan don't have these */
+   assert(scope != SpvScopeCrossDevice);
+
+   if (scope == SpvScopeSubgroup)
+  return; /* Nothing to do here */
+
+   if (scope == SpvScopeWorkgroup) {
+  vtn_emit_barrier(b, nir_intrinsic_group_memory_barrier);
+  return;
+   }
+
+   /* There are only two scopes left */
+   assert(scope == SpvScopeInvocation || scope == SpvScopeDevice);
+
+   if ((semantics & all_memory_semantics) == all_memory_semantics) {
+  vtn_emit_barrier(b, nir_intrinsic_memory_barrier);
+  return;
+   }
+
+   /* Issue a bunch of more specific barriers */
+   uint32_t bits = semantics;
+   while (bits) {
+  SpvMemorySemanticsMask semantic = 1 << u_bit_scan(&bits);
+  switch (semantic) {
+  case SpvMemorySemanticsUniformMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_buffer);
+ break;
+  case SpvMemorySemanticsWorkgroupMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_shared);
+ break;
+  case SpvMemorySemanticsAtomicCounterMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_atomic_counter);
+ break;
+  case SpvMemorySemanticsImageMemoryMask:
+ vtn_emit_barrier(b, nir_intrinsic_memory_barrier_image);
+ break;
+  default:
+ break;;
+  }
+   }
+}
+
+static void
 vtn_handle_barrier(struct vtn_builder *b, SpvOp opcode,
const uint32_t *w, unsigned count)
 {
-   nir_intrinsic_op intrinsic_op;
switch (opcode) {
case SpvOpEmitVertex:
case SpvOpEmitStreamVertex:
-  intrinsic_op = nir_intrinsic_emit_vertex;
-  break;
case SpvOpEndPrimitive:
-   case SpvOpEndStreamPrimitive:
-  intrinsic_op = nir_intrinsic_end_primitive;
-  break;
-   case SpvOpMemoryBarrier:
-  intrinsic_op = nir_intrinsic_memory_barrier;
-  break;
-   case SpvOpControlBarrier:
-  intrinsic_op = nir_intrinsic_barrier;
+   case SpvOpEndStreamPrimitive: {
+  nir_intrinsic_op intrinsic_op;
+  switch (opcode) {
+  case SpvOpEmitVertex:
+  case SpvOpEmitStreamVertex:
+ intrinsic_op = nir_intrinsic_emit_vertex;
+ break;
+  case SpvOpEndPrimitive:
+  case SpvOpEndStreamPrimitive:
+ intrinsic_op = nir_intrinsic_end_primitive;
+ break;
+  default:
+ unreachable("Invalid opcode");
+  }
+
+  nir_intrinsic_instr *intrin =
+ nir_intrinsic_instr_create(b->shader, intrinsic_op);
+
+  switch (opcode) {
+  case SpvOpEmitStreamVertex:
+  case SpvOpEndStreamPrimitive:
+ nir_intrinsic_set_stream_id(intrin, w[1]);
+ break;
+  default:
+ break;
+  }
+
+  nir_builder_instr_insert(&b->nb, &intrin->instr);
   break;
-   default:
-  unreachable("unknown barrier instruction");
}
 
-   nir_intrinsic_instr *intrin =
-  nir_intrinsic_instr_create(b->shader, intrinsic_op);
+   case SpvOpMemoryBarrier: {
+  SpvScope scope = vtn_constant_value(b, w[1])->values[0].u32[0];
+  SpvMemorySemanticsMask semantics =
+ vtn_constant_value(b, w[2])->values[0].u32[0];
+  vtn_emit_memory_barrier(b, scope, semantics);
+  return;
+   }
+
+   case SpvOpControlBarrier: {
+  SpvScope execution_scope =
+ vtn_constant_value(b, w[1])->values[0].u32[0];
+  if (execution_scope == SpvScopeWorkgroup)
+ vtn_emit_barrier(b, nir_intrinsic_barrier);
 
-   if (opcode == SpvOpEmitStreamVertex || opcode == SpvOpEndStreamPrimitive)
-  

[Mesa-dev] [PATCH v3 36/48] intel/eu: Explicitly set EXECUTE_1 where needed

2017-10-25 Thread Jason Ekstrand
---
 src/intel/compiler/brw_eu_emit.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 902914f..952d489 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -1983,6 +1983,7 @@ void brw_oword_block_write_scratch(struct brw_codegen *p,
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
   /* set message header global offset field (reg 0, element 2) */
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
  retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
  mrf.nr,
@@ -2102,6 +2103,7 @@ brw_oword_block_read_scratch(struct brw_codegen *p,
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
   /* set message header global offset field (reg 0, element 2) */
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
 
   brw_pop_insn_state(p);
@@ -2200,6 +2202,7 @@ void brw_oword_block_read(struct brw_codegen *p,
brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 
/* set message header global offset field (reg 0, element 2) */
+   brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_MOV(p,
   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
   mrf.nr,
@@ -2448,6 +2451,7 @@ void brw_urb_WRITE(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
   BRW_REGISTER_TYPE_UD),
retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
@@ -2507,6 +2511,7 @@ brw_send_indirect_message(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
   /* Load the indirect descriptor to an address register using OR so the
@@ -2551,6 +2556,7 @@ brw_send_indirect_surface_message(struct brw_codegen *p,
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
   /* Mask out invalid bits from the surface index to avoid hangs e.g. when
@@ -3278,6 +3284,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
  struct brw_reg exec_mask =
 retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
 /* Unfortunately, ce0 does not take into account the thread
  * dispatch mask, which may be a problem in cases where it's not
@@ -3299,6 +3306,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
   } else {
  const struct brw_reg flag = brw_flag_reg(1, 0);
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
 
  /* Run enough instructions returning zero with execution masking and
@@ -3324,6 +3332,7 @@ brw_find_live_channel(struct brw_codegen *p, struct 
brw_reg dst,
   * instructions.
   */
  const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
   }
} else {
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 37/48] intel/fs: Explicitly set EXECUTE_1 where needed

2017-10-25 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs.cpp   | 2 +-
 src/intel/compiler/brw_fs_generator.cpp | 7 +++
 src/intel/compiler/brw_fs_nir.cpp   | 8 
 src/intel/compiler/brw_fs_visitor.cpp   | 7 +++
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 6e2428c..41dda53 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4288,7 +4288,7 @@ emit_surface_header(const fs_builder , const fs_reg 
_mask)
fs_builder ubld = bld.exec_all().group(8, 0);
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.MOV(dst, brw_imm_d(0));
-   ubld.MOV(component(dst, 7), sample_mask);
+   ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
return dst;
 }
 
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index cc9f8ad..8322be1 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -323,6 +323,7 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
if (inst->header_size != 0) {
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_flag_reg(p, 0, 0);
@@ -395,11 +396,14 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
 
   /* Check runtime bit to detect if we have to send AA data or not */
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+  brw_push_insn_state(p);
+  brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
   brw_AND(p,
   v1_null_ud,
   retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
   brw_imm_ud(1<<26));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_NZ);
+  brw_pop_insn_state(p);
 
   int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
   {
@@ -946,6 +950,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
  /* Explicitly set up the message header by copying g0 to the MRF. */
  brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
 
+ brw_set_default_exec_size(p, BRW_EXECUTE_1);
  if (inst->offset) {
 /* Set the offset bits in DWord 2. */
 brw_MOV(p, get_element_ud(header_reg, 2),
@@ -999,6 +1004,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
 
   if (brw_regs_equal(_reg, _reg)) {
  brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
@@ -1446,6 +1452,7 @@ fs_generator::generate_mov_dispatch_to_flags(fs_inst 
*inst)
 
brw_push_insn_state(p);
brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_exec_size(p, BRW_EXECUTE_1);
brw_MOV(p, flags, dispatch_mask);
brw_pop_insn_state(p);
 }
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index d860c0f..b552387 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4200,7 +4200,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unreachable("not reached");
 
case nir_intrinsic_vote_any: {
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4232,7 +4232,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   break;
}
case nir_intrinsic_vote_all: {
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4266,7 +4266,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
case nir_intrinsic_vote_eq: {
   fs_reg value = get_nir_src(instr->src[0]);
   fs_reg uniformized = bld.emit_uniformize(value);
-  const fs_builder ubld = bld.exec_all();
+  const fs_builder ubld = bld.exec_all().group(1, 0);
 
   /* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
@@ -4311,7 +4311,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   if (dispatch_width == 32)
  flag.type = BRW_REGISTER_TYPE_UD;
 
-  

[Mesa-dev] [PATCH v3 45/48] spirv: Add a vtn_constant_value helper

2017-10-25 Thread Jason Ekstrand
Reviewed-by: Lionel Landwerlin 
---
 src/compiler/spirv/vtn_private.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index 8458462..e7a7c36 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -557,6 +557,12 @@ vtn_value(struct vtn_builder *b, uint32_t value_id,
return val;
 }
 
+static inline nir_constant *
+vtn_constant_value(struct vtn_builder *b, uint32_t value_id)
+{
+   return vtn_value(b, value_id, vtn_value_type_constant)->constant;
+}
+
 void _vtn_warn(const char *file, int line, const char *msg, ...);
 #define vtn_warn(...) _vtn_warn(__FILE__, __LINE__, __VA_ARGS__)
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 31/48] intel/cs: Re-run final NIR optimizations for each SIMD size

2017-10-25 Thread Jason Ekstrand
With the advent of SPIR-V subgroup operations, compute shaders will have
to be slightly different depending on the SIMD size at which they
execute.  In order to allow us to do dispatch-width specific things in
NIR, we re-run the final NIR stages for each SIMD width.

As a side-effect of this change, we start using ralloc on fs_visitor so
we need to add DECLARE_RALLOC_CXX_OPERATORS to fs_visitor.
---
 src/intel/compiler/brw_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index d3ab385..9ff06b6 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -60,7 +60,7 @@ offset(const fs_reg , const brw::fs_builder , 
unsigned delta)
 class fs_visitor : public backend_shader
 {
 public:
-   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
+   DECLARE_RALLOC_CXX_OPERATORS(fs_visitor)
 
fs_visitor(const struct brw_compiler *compiler, void *log_data,
   void *mem_ctx,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 39/48] nir: Add a new subgroups lowering pass

2017-10-25 Thread Jason Ekstrand
This commit pulls nir_lower_read_invocations_to_scalar along with most
of the guts of nir_opt_intrinsics (which mostly does subgroup lowering)
into a new nir_lower_subgroups pass.  There are various other bits of
subgroup lowering that we're going to want to do so it makes a bit more
sense to keep it all together in one pass.  We also move it in i965 to
happen after nir_lower_system_values because we want to handle the
subgroup mask system value intrinsics here.
---
 src/compiler/Makefile.sources  |   2 +-
 src/compiler/nir/nir.h |  12 +-
 .../nir/nir_lower_read_invocation_to_scalar.c  | 112 --
 src/compiler/nir/nir_lower_subgroups.c | 161 +
 src/compiler/nir/nir_opt_intrinsics.c  |  51 +--
 src/intel/compiler/brw_compiler.c  |   3 -
 src/intel/compiler/brw_nir.c   |   8 +-
 7 files changed, 184 insertions(+), 165 deletions(-)
 delete mode 100644 src/compiler/nir/nir_lower_read_invocation_to_scalar.c
 create mode 100644 src/compiler/nir/nir_lower_subgroups.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 27cc33a..12d932f 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -233,11 +233,11 @@ NIR_FILES = \
nir/nir_lower_passthrough_edgeflags.c \
nir/nir_lower_patch_vertices.c \
nir/nir_lower_phis_to_scalar.c \
-   nir/nir_lower_read_invocation_to_scalar.c \
nir/nir_lower_regs_to_ssa.c \
nir/nir_lower_returns.c \
nir/nir_lower_samplers.c \
nir/nir_lower_samplers_as_deref.c \
+   nir/nir_lower_subgroups.c \
nir/nir_lower_system_values.c \
nir/nir_lower_tex.c \
nir/nir_lower_to_source_mods.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index dd833cf..8c3a20c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1835,9 +1835,6 @@ typedef struct nir_shader_compiler_options {
bool lower_extract_byte;
bool lower_extract_word;
 
-   bool lower_vote_trivial;
-   bool lower_subgroup_masks;
-
/**
 * Does the driver support real 32-bit integers?  (Otherwise, integers
 * are simulated by floats.)
@@ -2462,6 +2459,15 @@ bool nir_lower_samplers(nir_shader *shader,
 bool nir_lower_samplers_as_deref(nir_shader *shader,
  const struct gl_shader_program 
*shader_program);
 
+typedef struct nir_lower_subgroups_options {
+   bool lower_to_scalar:1;
+   bool lower_vote_trivial:1;
+   bool lower_subgroup_masks:1;
+} nir_lower_subgroups_options;
+
+bool nir_lower_subgroups(nir_shader *shader,
+ const nir_lower_subgroups_options *options);
+
 bool nir_lower_system_values(nir_shader *shader);
 
 typedef struct nir_lower_tex_options {
diff --git a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c 
b/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
deleted file mode 100644
index 69e7c0a..000
--- a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-
-/** @file nir_lower_read_invocation_to_scalar.c
- *
- * Replaces nir_intrinsic_read_invocation/nir_intrinsic_read_first_invocation
- * operations with num_components != 1 with individual per-channel operations.
- */
-
-static void
-lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
-{
-   b->cursor = nir_before_instr(>instr);
-
-   nir_ssa_def *value = nir_ssa_for_src(b, intrin->src[0], 
intrin->num_components);
-   nir_ssa_def *reads[4];
-
-   for (unsigned i = 0; i < intrin->num_components; i++) {
-  nir_intrinsic_instr *chan_intrin =
- nir_intrinsic_instr_create(b->shader, 

[Mesa-dev] [PATCH v3 29/48] intel/cs: Rework the way thread local ID is handled

2017-10-25 Thread Jason Ekstrand
Previously, brw_nir_lower_intrinsics added the param and then emitted a
load_uniform intrinsic to load it directly.  This commit switches things
over to use a specific NIR intrinsic for the thread id.  The one thing I
don't like about this approach is that we have to copy thread_local_id
over to the new visitor in import_uniforms.
---
 src/compiler/nir/nir_intrinsics.h|  3 ++
 src/intel/compiler/brw_fs.cpp|  4 +-
 src/intel/compiler/brw_fs.h  |  1 +
 src/intel/compiler/brw_fs_nir.cpp| 14 +++
 src/intel/compiler/brw_nir.h |  3 +-
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 53 +---
 6 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.h 
b/src/compiler/nir/nir_intrinsics.h
index cefd18b..47022dd 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -364,6 +364,9 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color_rgba_unorm, 1, 0, xx, xx, xx)
 SYSTEM_VALUE(blend_const_color__unorm, 1, 0, xx, xx, xx)
 
+/* Intel specific system values */
+SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx)
+
 /**
  * Barycentric coordinate intrinsics.
  *
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2acd838..c0d4c05 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -996,6 +996,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
this->push_constant_loc = v->push_constant_loc;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
+   this->thread_local_id = v->thread_local_id;
 }
 
 void
@@ -6781,8 +6782,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
 {
nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
-
-   brw_nir_lower_cs_intrinsics(shader, prog_data);
+   brw_nir_lower_cs_intrinsics(shader);
shader = brw_postprocess_nir(shader, compiler, true);
 
prog_data->local_size[0] = shader->info.cs.local_size[0];
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index da32593..f51a4d8 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -315,6 +315,7 @@ public:
 */
int *push_constant_loc;
 
+   fs_reg thread_local_id;
fs_reg frag_depth;
fs_reg frag_stencil;
fs_reg sample_mask;
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 05efee3..fdc6fc6 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -88,6 +88,16 @@ fs_visitor::nir_setup_uniforms()
}
 
uniforms = nir->num_uniforms / 4;
+
+   if (stage == MESA_SHADER_COMPUTE) {
+  /* Add a uniform for the thread local id.  It must be the last uniform
+   * on the list.
+   */
+  assert(uniforms == prog_data->nr_params);
+  uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+  *param = BRW_PARAM_BUILTIN_THREAD_LOCAL_ID;
+  thread_local_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+   }
 }
 
 static bool
@@ -3409,6 +3419,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   cs_prog_data->uses_barrier = true;
   break;
 
+   case nir_intrinsic_load_intel_thread_local_id:
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), thread_local_id);
+  break;
+
case nir_intrinsic_load_local_invocation_id:
case nir_intrinsic_load_work_group_id: {
   gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 1493b74..3e40712 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -95,8 +95,7 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir);
 nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
nir_shader *nir);
 
-bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
- struct brw_cs_prog_data *prog_data);
+bool brw_nir_lower_cs_intrinsics(nir_shader *nir);
 void brw_nir_lower_vs_inputs(nir_shader *nir,
  bool use_legacy_snorm_formula,
  const uint8_t *vs_attrib_wa_flags);
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c 
b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index d277276..07d2dcc 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -26,47 +26,12 @@
 
 struct lower_intrinsics_state {
nir_shader *nir;
-   struct brw_cs_prog_data *prog_data;
nir_function_impl *impl;
bool progress;
nir_builder builder;
-   int thread_local_id_index;
+   unsigned local_workgroup_size;
 };
 
-static nir_ssa_def *
-read_thread_local_id(struct lower_intrinsics_state 

[Mesa-dev] [PATCH v3 33/48] intel/compiler/fs: Set up subgroup invocation as a system value

2017-10-25 Thread Jason Ekstrand
Subgroup invocation is computed using a vector immediate and some
dispatch-aware arithmetic.  Unfortunately, due to the vector arithmetic,
and the fact that it's frequently read 16-wide, it's not something that
can easily be CSEd by the back-end compiler.  There are a few different
possible approaches to this problem:

 1) Emit the code to calculate the subgroup invocation on-the-fly and
trust NIR to do the CSE.  This is what we were doing.

 2) Add a back-end instruction for the subgroup ID.  This has the
advantage of helping the back-end compiler with CSE but has the
downside of very poor scheduling for the calculation because it has
to be emitted in the back-end.

 3) Emit the calculation at the top of the program and re-use the
result.  This gets rid of the CSE problem but comes at the cost of
an extra live register.

This commit switches us from 1) to 3).  We choose to store the subgroup
invocation values as a W type to reduce the impact of the extra live
register.  Trusting NIR and using 1) was fine but we're soon going to
want to use the subgroup invocation value for other things in the
back-end compiler and this makes it much easier to do without having to
worry about CSE problems.
---
 src/intel/compiler/brw_fs_nir.cpp | 34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index cc5731f..d860c0f 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -231,6 +231,24 @@ fs_visitor::nir_emit_system_values()
   nir_system_values[i] = fs_reg();
}
 
+   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
+* never end up using it.
+*/
+   {
+  const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
+  fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
+  reg = abld.vgrf(BRW_REGISTER_TYPE_W);
+
+  const fs_builder allbld8 = abld.group(8, 0).exec_all();
+  allbld8.MOV(reg, brw_imm_v(0x76543210));
+  if (dispatch_width > 8)
+ allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
+  if (dispatch_width > 16) {
+ const fs_builder allbld16 = abld.group(16, 0).exec_all();
+ allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
+  }
+   }
+
nir_foreach_function(function, nir) {
   assert(strcmp(function->name, "main") == 0);
   assert(function->impl);
@@ -4169,20 +4187,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
   break;
 
-   case nir_intrinsic_load_subgroup_invocation: {
-  fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
-  dest = retype(dest, BRW_REGISTER_TYPE_UD);
-  const fs_builder allbld8 = bld.group(8, 0).exec_all();
-  allbld8.MOV(tmp, brw_imm_v(0x76543210));
-  if (dispatch_width > 8)
- allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
-  if (dispatch_width > 16) {
- const fs_builder allbld16 = bld.group(16, 0).exec_all();
- allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
-  }
-  bld.MOV(dest, tmp);
+   case nir_intrinsic_load_subgroup_invocation:
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+  nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
   break;
-   }
 
case nir_intrinsic_load_subgroup_eq_mask:
case nir_intrinsic_load_subgroup_ge_mask:
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 30/48] intel/cs: Re-run final NIR optimizations for each SIMD size

2017-10-25 Thread Jason Ekstrand
With the advent of SPIR-V subgroup operations, compute shaders will have
to be slightly different depending on the SIMD size at which they
execute.  In order to allow us to do dispatch-width specific things in
NIR, we re-run the final NIR stages for each SIMD width.

One side-effect of this change is that we start rallocing fs_visitors
which means we need DECLARE_RALLOC_CXX_OPERATORS.
---
 src/intel/compiler/brw_fs.cpp | 103 ++
 src/intel/compiler/brw_fs.h   |   2 +
 2 files changed, 66 insertions(+), 39 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c0d4c05..c054537 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6770,6 +6770,20 @@ cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, 
unsigned size)
cs_prog_data->threads = (group_size + size - 1) / size;
 }
 
+static nir_shader *
+compile_cs_to_nir(const struct brw_compiler *compiler,
+  void *mem_ctx,
+  const struct brw_cs_prog_key *key,
+  struct brw_cs_prog_data *prog_data,
+  const nir_shader *src_shader,
+  unsigned dispatch_width)
+{
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
+   brw_nir_lower_cs_intrinsics(shader);
+   return brw_postprocess_nir(shader, compiler, true);
+}
+
 const unsigned *
 brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
void *mem_ctx,
@@ -6780,17 +6794,12 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
unsigned *final_assembly_size,
char **error_str)
 {
-   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   shader = brw_nir_apply_sampler_key(shader, compiler, >tex, true);
-   brw_nir_lower_cs_intrinsics(shader);
-   shader = brw_postprocess_nir(shader, compiler, true);
-
-   prog_data->local_size[0] = shader->info.cs.local_size[0];
-   prog_data->local_size[1] = shader->info.cs.local_size[1];
-   prog_data->local_size[2] = shader->info.cs.local_size[2];
+   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
unsigned local_workgroup_size =
-  shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
-  shader->info.cs.local_size[2];
+  src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
+  src_shader->info.cs.local_size[2];
 
unsigned min_dispatch_width =
   DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
@@ -6798,71 +6807,87 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
min_dispatch_width = util_next_power_of_two(min_dispatch_width);
assert(min_dispatch_width <= 32);
 
+
+   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
+   unsigned promoted_constants;
 
/* Now the main event: Visit the shader IR and generate our CS IR for it.
 */
-   fs_visitor v8(compiler, log_data, mem_ctx, key, _data->base,
- NULL, /* Never used in core profile */
- shader, 8, shader_time_index);
if (min_dispatch_width <= 8) {
-  if (!v8.run_cs(min_dispatch_width)) {
- fail_msg = v8.fail_msg;
+  nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
+   prog_data, src_shader, 8);
+  v8 = new(mem_ctx) fs_visitor(compiler, log_data, mem_ctx, key,
+   _data->base,
+   NULL, /* Never used in core profile */
+   nir8, 8, shader_time_index);
+  if (!v8->run_cs(min_dispatch_width)) {
+ fail_msg = v8->fail_msg;
   } else {
- cfg = v8.cfg;
+ cfg = v8->cfg;
  cs_set_simd_size(prog_data, 8);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
+ promoted_constants = v8->promoted_constants;
   }
}
 
-   fs_visitor v16(compiler, log_data, mem_ctx, key, _data->base,
- NULL, /* Never used in core profile */
- shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
!fail_msg && min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
-  if (min_dispatch_width <= 8)
- v16.import_uniforms();
-  if (!v16.run_cs(min_dispatch_width)) {
+  nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
+prog_data, src_shader, 16);
+  v16 = new(mem_ctx) fs_visitor(compiler, log_data, mem_ctx, key,
+_data->base,
+NULL, /* Never used in core profile */
+nir16, 16, 

[Mesa-dev] [PATCH v3 34/48] intel/fs: Rework zero-length URB write handling

2017-10-25 Thread Jason Ekstrand
Originally we tried to handle this case based on slots_valid.  However,
there are a number of ways that this can go wrong.  For one, we throw
away any trailing slots which either aren't written or are set to
VARYING_SLOT_PAD.  Second, even if PSIZ is a valid slot, we may not
actually write anything there.  Between the lot of these, it was
possible to end up in a case where we tried to do a regular URB write
but ended up with a length of 1 which is invalid.  This commit moves it
to the end and makes it based on a new boolean flag urb_written.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_visitor.cpp | 60 ++-
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/src/intel/compiler/brw_fs_visitor.cpp 
b/src/intel/compiler/brw_fs_visitor.cpp
index 9fd4c20..9a19dc2 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -566,34 +566,6 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
else
   urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
-   /* If we don't have any valid slots to write, just do a minimal urb write
-* send to terminate the shader.  This includes 1 slot of undefined data,
-* because it's invalid to write 0 data:
-*
-* From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
-* Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
-* Write Data Payload:
-*
-*"The write data payload can be between 1 and 8 message phases long."
-*/
-   if (vue_map->slots_valid == 0) {
-  /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
-   * end the thread, and emit_gs_thread_end() already emits a SEND with
-   * EOT at the end of the program for us.
-   */
-  if (stage == MESA_SHADER_GEOMETRY)
- return;
-
-  fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
-  bld.exec_all().MOV(payload, urb_handle);
-
-  fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, 
payload);
-  inst->eot = true;
-  inst->mlen = 2;
-  inst->offset = 1;
-  return;
-   }
-
opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
int header_size = 1;
fs_reg per_slot_offsets;
@@ -645,6 +617,7 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
   last_slot--;
}
 
+   bool urb_written = false;
for (slot = 0; slot < vue_map->num_slots; slot++) {
   int varying = vue_map->slot_to_varying[slot];
   switch (varying) {
@@ -730,7 +703,7 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
* the last slot or if we need to flush (see BAD_FILE varying case
* above), emit a URB write send now to flush out the data.
*/
-  if (length == 8 || slot == last_slot)
+  if (length == 8 || (length > 0 && slot == last_slot))
  flush = true;
   if (flush) {
  fs_reg *payload_sources =
@@ -755,8 +728,37 @@ fs_visitor::emit_urb_writes(const fs_reg _vertex_count)
  urb_offset = starting_urb_offset + slot + 1;
  length = 0;
  flush = false;
+ urb_written = true;
   }
}
+
+   /* If we don't have any valid slots to write, just do a minimal urb write
+* send to terminate the shader.  This includes 1 slot of undefined data,
+* because it's invalid to write 0 data:
+*
+* From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+* Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+* Write Data Payload:
+*
+*"The write data payload can be between 1 and 8 message phases long."
+*/
+   if (!urb_written) {
+  /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
+   * end the thread, and emit_gs_thread_end() already emits a SEND with
+   * EOT at the end of the program for us.
+   */
+  if (stage == MESA_SHADER_GEOMETRY)
+ return;
+
+  fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
+  bld.exec_all().MOV(payload, urb_handle);
+
+  fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, 
payload);
+  inst->eot = true;
+  inst->mlen = 2;
+  inst->offset = 1;
+  return;
+   }
 }
 
 void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 21/48] intel/fs: Uniformize the index in readInvocation

2017-10-25 Thread Jason Ekstrand
The index is any value provided by the shader and this can be called in
non-uniform control flow so we can't just take component 0.  Found by
inspection.
---
 src/intel/compiler/brw_fs_nir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a441f57..a3a863e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4304,7 +4304,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   fs_reg tmp = bld.vgrf(value.type);
 
   bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
-  component(invocation, 0));
+  bld.emit_uniformize(invocation));
 
   bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
   fs_reg(component(tmp, 0)));
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 28/48] intel/fs: Mark 64-bit values as being contiguous

2017-10-25 Thread Jason Ekstrand
This isn't often a problem.  However, when we're in a compute shader, we
must push the thread local ID, so the amount of available push space is
decremented by 1 and is no longer even; 64-bit data can then, in theory,
span the push/pull boundary.  By marking those uniforms contiguous, we
ensure that they never get split in half between push and pull constants.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4ebf539..2acd838 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1968,7 +1968,7 @@ fs_visitor::assign_constant_locations()
 
/* For each uniform slot, a value of true indicates that the given slot and
 * the next slot must remain contiguous.  This is used to keep us from
-* splitting arrays apart.
+* splitting arrays and 64-bit values apart.
 */
bool contiguous[uniforms];
memset(contiguous, 0, sizeof(contiguous));
@@ -2005,6 +2005,9 @@ fs_visitor::assign_constant_locations()
 if (constant_nr >= 0 && constant_nr < (int) uniforms) {
int regs_read = inst->components_read(i) *
   type_sz(inst->src[i].type) / 4;
+   assert(regs_read <= 2);
+   if (regs_read == 2)
+  contiguous[constant_nr] = true;
for (int j = 0; j < regs_read; j++) {
   is_live[constant_nr + j] = true;
   bitsize_access[constant_nr + j] =
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 35/48] intel/eu: Make automatic exec sizes a configurable option

2017-10-25 Thread Jason Ekstrand
We have had a feature in codegen for some time that tries to
automatically infer the execution size of an instruction from the width
of its destination.  For things such as fixed function GS, clipper, and
SF programs, this is very useful because they tend to have lots of
hand-rolled register setup and trying to specify the exec size all the
time would be prohibitive.  For things that come from a higher-level IR,
however, it's easier to just set the right size all the time and the
automatic exec sizes can, in fact, cause problems.  This commit makes it
optional while enabling it by default.
---
 src/intel/compiler/brw_eu.c  |  1 +
 src/intel/compiler/brw_eu.h  | 10 ++
 src/intel/compiler/brw_eu_emit.c | 32 ++--
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
index b0bdc38..bc297a2 100644
--- a/src/intel/compiler/brw_eu.c
+++ b/src/intel/compiler/brw_eu.c
@@ -296,6 +296,7 @@ brw_init_codegen(const struct gen_device_info *devinfo,
memset(p, 0, sizeof(*p));
 
p->devinfo = devinfo;
+   p->automatic_exec_sizes = true;
/*
 * Set the initial instruction store array size to 1024, if found that
 * isn't enough, then it will double the store size at brw_next_insn()
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 8e597b2..8abebeb 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -65,6 +65,16 @@ struct brw_codegen {
bool compressed_stack[BRW_EU_MAX_INSN_STACK];
brw_inst *current;
 
+   /** Whether or not the user wants automatic exec sizes
+*
+* If true, codegen will try to automatically infer the exec size of an
+* instruction from the width of the destination register.  If false, it
+* will take whatever is set by brw_set_default_exec_size verbatim.
+*
+* This is set to true by default in brw_init_codegen.
+*/
+   bool automatic_exec_sizes;
+
bool single_program_flow;
const struct gen_device_info *devinfo;
 
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index fae74cf..902914f 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -141,22 +141,26 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, 
struct brw_reg dest)
 
/* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
 * or 16 (SIMD16), as that's normally correct.  However, when dealing with
-* small registers, we automatically reduce it to match the register size.
-*
-* In platforms that support fp64 we can emit instructions with a width of
-* 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
-* cases we need to make sure that these instructions have their exec sizes
-* set properly when they are emitted and we can't rely on this code to fix
-* it.
+* small registers, it can be useful for us to automatically reduce it to
+* match the register size.
 */
-   bool fix_exec_size;
-   if (devinfo->gen >= 6)
-  fix_exec_size = dest.width < BRW_EXECUTE_4;
-   else
-  fix_exec_size = dest.width < BRW_EXECUTE_8;
+   if (p->automatic_exec_sizes) {
+  /*
+   * In platforms that support fp64 we can emit instructions with a width
+   * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
+   * these cases we need to make sure that these instructions have their
+   * exec sizes set properly when they are emitted and we can't rely on
+   * this code to fix it.
+   */
+  bool fix_exec_size;
+  if (devinfo->gen >= 6)
+ fix_exec_size = dest.width < BRW_EXECUTE_4;
+  else
+ fix_exec_size = dest.width < BRW_EXECUTE_8;
 
-   if (fix_exec_size)
-  brw_inst_set_exec_size(devinfo, inst, dest.width);
+  if (fix_exec_size)
+ brw_inst_set_exec_size(devinfo, inst, dest.width);
+   }
 }
 
 void
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 18/48] i965/fs/nir: Minor refactor of store_output

2017-10-25 Thread Jason Ekstrand
Stop retyping the output of shuffle_64bit_data_for_32bit_write.  It's
always BRW_REGISTER_TYPE_D which is perfectly fine for writing out.
Also, when we change get_nir_src to return something with a 64-bit type
for 64-bit values, the retyping will not be at all what we want.  Also,
retyping the output based on src.type before we whack it back to 32 bits
is a problem because the output is always 32 bits.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 5bcdb1a..e008e2e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4058,18 +4058,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
 
   nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
   assert(const_offset && "Indirect output stores not allowed");
-  fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
-  4 * const_offset->u32[0]), src.type);
 
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
  fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
 retype(src, BRW_REGISTER_TYPE_DF), num_components);
- src = retype(tmp, src.type);
+ src = tmp;
  num_components *= 2;
   }
 
+  fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+  4 * const_offset->u32[0]), src.type);
   for (unsigned j = 0; j < num_components; j++) {
  bld.MOV(offset(new_dest, bld, j + first_component),
  offset(src, bld, j));
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 38/48] intel/fs: Don't use automatic exec size inference

2017-10-25 Thread Jason Ekstrand
The automatic exec size inference can accidentally mess things up if
we're not careful.  For instance, if we have

    add(4)    g38.2<4>D    g38.1<8,2,4>D    g38.2<8,2,4>D

then the destination register will end up having a width of 2 with a
horizontal stride of 4 and a vertical stride of 8.  The EU emit code
sees the width of 2 and decides that we really wanted an exec size of 2
which doesn't do what we wanted.
---
 src/intel/compiler/brw_fs_generator.cpp | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 8322be1..46f9a33 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -190,6 +190,12 @@ fs_generator::fs_generator(const struct brw_compiler 
*compiler, void *log_data,
 {
p = rzalloc(mem_ctx, struct brw_codegen);
brw_init_codegen(devinfo, p, mem_ctx);
+
+   /* In the FS code generator, we are very careful to ensure that we always
+* set the right execution size so we don't need the EU code to "help" us
+* by trying to infer it.  Sometimes, it infers the wrong thing.
+*/
+   p->automatic_exec_sizes = false;
 }
 
 fs_generator::~fs_generator()
@@ -395,17 +401,17 @@ fs_generator::generate_fb_write(fs_inst *inst, struct 
brw_reg payload)
   struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), 
BRW_REGISTER_TYPE_UD));
 
   /* Check runtime bit to detect if we have to send AA data or not */
-  brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_push_insn_state(p);
-  brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
+  brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+  brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_AND(p,
   v1_null_ud,
   retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
   brw_imm_ud(1<<26));
   brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, 
BRW_CONDITIONAL_NZ);
-  brw_pop_insn_state(p);
 
   int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
+  brw_pop_insn_state(p);
   {
  /* Don't send AA data */
  fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 26/48] intel/cs: Stop setting dispatch_grf_start_reg

2017-10-25 Thread Jason Ekstrand
Nothing ever reads it for compute shaders because it's always 1.
---
 src/intel/compiler/brw_compiler.h | 1 -
 src/intel/compiler/brw_fs.cpp | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/intel/compiler/brw_compiler.h 
b/src/intel/compiler/brw_compiler.h
index 014202d..508d4ba 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -734,7 +734,6 @@ struct brw_push_const_block {
 struct brw_cs_prog_data {
struct brw_stage_prog_data base;
 
-   GLuint dispatch_grf_start_reg_16;
unsigned local_size[3];
unsigned simd_size;
unsigned threads;
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4c362ba..35d1ca4 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6810,7 +6810,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  cfg = v8.cfg;
  cs_set_simd_size(prog_data, 8);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
- prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
   }
}
 
@@ -6835,7 +6834,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
  cfg = v16.cfg;
  cs_set_simd_size(prog_data, 16);
  cs_fill_push_const_info(compiler->devinfo, prog_data);
- prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
   }
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 27/48] intel/cs: Ignore runtime_check_aads_emit for CS

2017-10-25 Thread Jason Ekstrand
It's only set on gen4-5 which clearly don't support compute shaders.
---
 src/intel/compiler/brw_fs.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 35d1ca4..4ebf539 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6872,8 +6872,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
}
 
fs_generator g(compiler, log_data, mem_ctx, (void*) key, _data->base,
-  v8.promoted_constants, v8.runtime_check_aads_emit,
-  MESA_SHADER_COMPUTE);
+  v8.promoted_constants, false, MESA_SHADER_COMPUTE);
if (INTEL_DEBUG & DEBUG_CS) {
   char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
shader->info.label ? shader->info.label :
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 19/48] i965/fs/nir: Don't stomp 64-bit values to D in get_nir_src

2017-10-25 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs_nir.cpp | 33 +
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index e008e2e..a441f57 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1441,11 +1441,19 @@ fs_visitor::get_nir_src(const nir_src )
src.reg.base_offset * src.reg.reg->num_components);
}
 
-   /* to avoid floating-point denorm flushing problems, set the type by
-* default to D - instructions that need floating point semantics will set
-* this to F if they need to
-*/
-   return retype(reg, BRW_REGISTER_TYPE_D);
+   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
+  /* The only 64-bit type available on gen7 is DF, so use that. */
+  reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+  /* To avoid floating-point denorm flushing problems, set the type by
+   * default to an integer type - instructions that need floating point
+   * semantics will set this to F if they need to
+   */
+  reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
+BRW_REGISTER_TYPE_D);
+   }
+
+   return reg;
 }
 
 /**
@@ -1455,6 +1463,10 @@ fs_reg
 fs_visitor::get_nir_src_imm(const nir_src )
 {
nir_const_value *val = nir_src_as_const_value(src);
+   /* This function shouldn't be called on anything which can even
+* possibly be 64 bits as it can't do what it claims.
+*/
+   assert(nir_src_bit_size(src) == 32);
return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
 }
 
@@ -2648,8 +2660,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 */
unsigned channel = iter * 2 + i;
fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
-  retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
-  1);
+  offset(value, bld, channel), 1);
 
srcs[header_regs + (i + first_component) * 2] = dest;
srcs[header_regs + (i + first_component) * 2 + 1] =
@@ -3505,8 +3516,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  val_reg = shuffle_64bit_data_for_32bit_write(bld,
-retype(val_reg, BRW_REGISTER_TYPE_DF),
-instr->num_components);
+val_reg, instr->num_components);
   }
 
   unsigned type_slots = type_size / 4;
@@ -4005,8 +4015,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
  val_reg = shuffle_64bit_data_for_32bit_write(bld,
-retype(val_reg, BRW_REGISTER_TYPE_DF),
-instr->num_components);
+val_reg, instr->num_components);
   }
 
   unsigned type_slots = type_size / 4;
@@ -4063,7 +4072,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
  fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
-retype(src, BRW_REGISTER_TYPE_DF), num_components);
+src, num_components);
  src = tmp;
  num_components *= 2;
   }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 13/48] intel/fs: Use the original destination region for int MUL lowering

2017-10-25 Thread Jason Ekstrand
Some hardware (CHV, BXT) have special restrictions on register regions
when doing integer multiplication.  We want to respect those when we
lower to DxW multiplication.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 56455e9..1c4351b 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -3482,18 +3482,20 @@ fs_visitor::lower_integer_multiplication()
 
 bool needs_mov = false;
 fs_reg orig_dst = inst->dst;
+fs_reg low = inst->dst;
 if (orig_dst.is_null() || orig_dst.file == MRF ||
 regions_overlap(inst->dst, inst->size_written,
 inst->src[0], inst->size_read(0)) ||
 regions_overlap(inst->dst, inst->size_written,
 inst->src[1], inst->size_read(1))) {
needs_mov = true;
-   inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
-  inst->dst.type);
+   low.nr = alloc.allocate(regs_written(inst));
+   low.offset = low.offset % REG_SIZE;
 }
-fs_reg low = inst->dst;
-fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
-inst->dst.type);
+
+fs_reg high = inst->dst;
+high.nr = alloc.allocate(regs_written(inst));
+high.offset = high.offset % REG_SIZE;
 
 if (devinfo->gen >= 7) {
if (inst->src[1].file == IMM) {
@@ -3514,13 +3516,13 @@ fs_visitor::lower_integer_multiplication()
 inst->src[1]);
 }
 
-ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
+ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
  subscript(low, BRW_REGISTER_TYPE_UW, 1),
  subscript(high, BRW_REGISTER_TYPE_UW, 0));
 
 if (needs_mov || inst->conditional_mod) {
set_condmod(inst->conditional_mod,
-   ibld.MOV(orig_dst, inst->dst));
+   ibld.MOV(orig_dst, low));
 }
  }
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 10/48] intel/eu: Fix broadcast instruction for 64-bit values on little-core

2017-10-25 Thread Jason Ekstrand
We're not using broadcast for any 64-bit types right now since we mostly
use it for emit_uniformize on 32-bit buffer indices.  However, SPIR-V
subgroups are going to need it for 64-bit so let's make it work.
---
 src/intel/compiler/brw_eu_emit.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index a18cfa4..fae74cf 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3430,8 +3430,30 @@ brw_broadcast(struct brw_codegen *p,
  brw_pop_insn_state(p);
 
  /* Use indirect addressing to fetch the specified component. */
- brw_MOV(p, dst,
- retype(brw_vec1_indirect(addr.subnr, offset), src.type));
+ if (type_sz(src.type) > 4 &&
+ (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+/* From the Cherryview PRM Vol 7. "Register Region Restrictions":
+ *
+ *"When source or destination datatype is 64b or operation is
+ *integer DWord multiply, indirect addressing must not be
+ *used."
+ *
+ * To work around this issue, we do two integer MOVs
+ * instead of one 64-bit MOV.  Because no double value should ever
+ * cross a register boundary, it's safe to use the immediate
+ * offset in the indirect here to handle adding 4 bytes to the
+ * offset and avoid the extra ADD to the register file.
+ */
+brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
+   retype(brw_vec1_indirect(addr.subnr, offset),
+  BRW_REGISTER_TYPE_D));
+brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
+   retype(brw_vec1_indirect(addr.subnr, offset + 4),
+  BRW_REGISTER_TYPE_D));
+ } else {
+brw_MOV(p, dst,
+retype(brw_vec1_indirect(addr.subnr, offset), src.type));
+ }
   } else {
  /* In SIMD4x2 mode the index can be either zero or one, replicate it
   * to all bits of a flag register,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 16/48] i965/fs/nir: Simplify 64-bit store_output

2017-10-25 Thread Jason Ekstrand
The swizzles weren't doing any good because swiz is just XYZW.  Also, we
were emitting an extra set of MOVs because shuffle_64bit_data_for_32bit
already does a MOV for us.  Finally, the temporary was only ever used
inside the inner loop so there's no need for it to actually be an array.
---
 src/intel/compiler/brw_fs_nir.cpp | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index f433e3b..d0625c8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2568,7 +2568,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
  instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
   fs_reg indirect_offset = get_indirect_offset(instr);
   unsigned imm_offset = instr->const_index[0];
-  unsigned swiz = BRW_SWIZZLE_XYZW;
   unsigned mask = instr->const_index[1];
   unsigned header_regs = 0;
   fs_reg srcs[7];
@@ -2598,13 +2597,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
  }
   }
 
-  /* 64-bit data needs to me shuffled before we can write it to the URB.
-   * We will use this temporary to shuffle the components in each
-   * iteration.
-   */
-  fs_reg tmp =
- fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
-
   mask = mask << first_component;
 
   for (unsigned iter = 0; iter < num_iterations; iter++) {
@@ -2648,26 +2640,21 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder 
,
continue;
 
 if (!is_64bit) {
-   srcs[header_regs + i + first_component] =
-  offset(value, bld, BRW_GET_SWZ(swiz, i));
+   srcs[header_regs + i + first_component] = offset(value, bld, i);
 } else {
/* We need to shuffle the 64-bit data to match the layout
 * expected by our 32-bit URB write messages. We use a temporary
 * for that.
 */
-   unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
+   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
+   unsigned channel = iter * 2 + i;
shuffle_64bit_data_for_32bit_write(bld,
-  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
+  retype(dest, BRW_REGISTER_TYPE_F),
   retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
   1);
 
-   /* Now copy the data to the destination */
-   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
-   unsigned idx = 2 * i;
-   bld.MOV(dest, offset(tmp, bld, idx));
-   bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
-   srcs[header_regs + idx + first_component * 2] = dest;
-   srcs[header_regs + idx + 1 + first_component * 2] =
+   srcs[header_regs + (i + first_component) * 2] = dest;
+   srcs[header_regs + (i + first_component) * 2 + 1] =
   offset(dest, bld, 1);
 }
  }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 23/48] intel/fs: Assign constant locations if they haven't been assigned

2017-10-25 Thread Jason Ekstrand
Before, we were bailing in assign_constant_locations based on the minimum
dispatch size.  The more direct thing to do is simply to check for
whether or not we have constant locations and bail if we do.  For
nir_setup_uniforms, it's completely safe to do it multiple times because
we just copy a value from the NIR shader.
---
 src/intel/compiler/brw_fs.cpp | 4 +++-
 src/intel/compiler/brw_fs_nir.cpp | 5 -
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 52079d3..75139fd 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1956,8 +1956,10 @@ void
 fs_visitor::assign_constant_locations()
 {
/* Only the first compile gets to decide on locations. */
-   if (dispatch_width != min_dispatch_width)
+   if (push_constant_loc) {
+  assert(pull_constant_loc);
   return;
+   }
 
bool is_live[uniforms];
memset(is_live, 0, sizeof(is_live));
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 7556576..05efee3 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -81,8 +81,11 @@ fs_visitor::nir_setup_outputs()
 void
 fs_visitor::nir_setup_uniforms()
 {
-   if (dispatch_width != min_dispatch_width)
+   /* Only the first compile gets to set up uniforms. */
+   if (push_constant_loc) {
+  assert(pull_constant_loc);
   return;
+   }
 
uniforms = nir->num_uniforms / 4;
 }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 24/48] intel/fs: Remove min_dispatch_width from fs_visitor

2017-10-25 Thread Jason Ekstrand
It's 8 for everything except compute shaders.  For compute shaders,
there's no need to duplicate the computation and it's just a possible
source of error.
---
 src/intel/compiler/brw_fs.cpp | 42 +++
 src/intel/compiler/brw_fs.h   |  5 ++---
 src/intel/compiler/brw_fs_visitor.cpp | 11 -
 3 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 75139fd..a23366b 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5886,7 +5886,7 @@ fs_visitor::fixup_3src_null_dest()
 }
 
 void
-fs_visitor::allocate_registers(bool allow_spilling)
+fs_visitor::allocate_registers(unsigned min_dispatch_width, bool 
allow_spilling)
 {
bool allocated_without_spills;
 
@@ -6021,7 +6021,7 @@ fs_visitor::run_vs()
assign_vs_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6101,7 +6101,7 @@ fs_visitor::run_tcs_single_patch()
assign_tcs_single_patch_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6135,7 +6135,7 @@ fs_visitor::run_tes()
assign_tes_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6184,7 +6184,7 @@ fs_visitor::run_gs()
assign_gs_urb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(8, true);
 
return !failed;
 }
@@ -6255,7 +6255,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
   assign_urb_setup();
 
   fixup_3src_null_dest();
-  allocate_registers(allow_spilling);
+  allocate_registers(8, allow_spilling);
 
   if (failed)
  return false;
@@ -6265,9 +6265,10 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 }
 
 bool
-fs_visitor::run_cs()
+fs_visitor::run_cs(unsigned min_dispatch_width)
 {
assert(stage == MESA_SHADER_COMPUTE);
+   assert(dispatch_width >= min_dispatch_width);
 
setup_cs_payload();
 
@@ -6298,7 +6299,7 @@ fs_visitor::run_cs()
assign_curb_setup();
 
fixup_3src_null_dest();
-   allocate_registers(true);
+   allocate_registers(min_dispatch_width, true);
 
if (failed)
   return false;
@@ -6788,8 +6789,11 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
   shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
   shader->info.cs.local_size[2];
 
-   unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
-   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
+   unsigned min_dispatch_width =
+  DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
+   min_dispatch_width = MAX2(8, min_dispatch_width);
+   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
+   assert(min_dispatch_width <= 32);
 
cfg_t *cfg = NULL;
const char *fail_msg = NULL;
@@ -6799,8 +6803,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void 
*log_data,
fs_visitor v8(compiler, log_data, mem_ctx, key, _data->base,
  NULL, /* Never used in core profile */
  shader, 8, shader_time_index);
-   if (simd_required <= 8) {
-  if (!v8.run_cs()) {
+   if (min_dispatch_width <= 8) {
+  if (!v8.run_cs(min_dispatch_width)) {
  fail_msg = v8.fail_msg;
   } else {
  cfg = v8.cfg;
@@ -6815,11 +6819,11 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
  shader, 16, shader_time_index);
if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
!fail_msg && v8.max_dispatch_width >= 16 &&
-   simd_required <= 16) {
+   min_dispatch_width <= 16) {
   /* Try a SIMD16 compile */
-  if (simd_required <= 8)
+  if (min_dispatch_width <= 8)
  v16.import_uniforms();
-  if (!v16.run_cs()) {
+  if (!v16.run_cs(min_dispatch_width)) {
  compiler->shader_perf_log(log_data,
"SIMD16 shader failed to compile: %s",
v16.fail_msg);
@@ -6840,14 +6844,14 @@ brw_compile_cs(const struct brw_compiler *compiler, 
void *log_data,
  NULL, /* Never used in core profile */
  shader, 32, shader_time_index);
if (!fail_msg && v8.max_dispatch_width >= 32 &&
-   (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+   (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
   /* Try a SIMD32 compile */
-  if (simd_required <= 8)
+  if (min_dispatch_width <= 8)
  v32.import_uniforms();
-  else if (simd_required <= 16)
+  else if (min_dispatch_width <= 16)
  v32.import_uniforms();
 
-  if (!v32.run_cs()) {
+  if (!v32.run_cs(min_dispatch_width)) {
  compiler->shader_perf_log(log_data,

[Mesa-dev] [PATCH v3 20/48] intel/fs: Protect opt_algebraic from OOB BROADCAST indices

2017-10-25 Thread Jason Ekstrand
---
 src/intel/compiler/brw_fs.cpp | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1c4351b..52079d3 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2416,8 +2416,14 @@ fs_visitor::opt_algebraic()
 progress = true;
  } else if (inst->src[1].file == IMM) {
 inst->opcode = BRW_OPCODE_MOV;
-inst->src[0] = component(inst->src[0],
- inst->src[1].ud);
+/* It's possible that the selected component will be too large and
+ * overflow the register.  If this happens and we somehow manage
+ * to constant fold it in and get here, it would cause an assert
+ * in component() below.  Instead, just let it wrap around if it
+ * goes over exec_size.
+ */
+const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
+inst->src[0] = component(inst->src[0], comp);
 inst->sources = 1;
 inst->force_writemask_all = true;
 progress = true;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 09/48] intel/eu/reg: Add a subscript() helper

2017-10-25 Thread Jason Ekstrand
This is similar to the identically named fs_reg helper.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_reg.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index d68d64f..9d63717 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -896,6 +896,22 @@ spread(struct brw_reg reg, unsigned s)
}
 }
 
+/**
+ * Reinterpret each channel of register \p reg as a vector of values of the
+ * given smaller type and take the i-th subcomponent from each.
+ */
+static inline struct brw_reg
+subscript(struct brw_reg reg, enum brw_reg_type type, unsigned i)
+{
+   if (reg.file == IMM)
+  return reg;
+
+   unsigned scale = type_sz(reg.type) / type_sz(type);
+   assert(scale >= 1 && i < scale);
+
+   return suboffset(retype(spread(reg, scale), type), i);
+}
+
 static inline struct brw_reg
 vec16(struct brw_reg reg)
 {
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 17/48] i965/fs: Return a fs_reg from shuffle_64bit_data_for_32bit_write

2017-10-25 Thread Jason Ekstrand
All callers of this function allocate a fs_reg expressly to pass into
it.  It's much easier if we just let the helper allocate the register.
While we're here, we switch it to doing the MOVs with an integer type so
that we don't accidentally canonicalize floats on half of a double.
---
 src/intel/compiler/brw_fs.h   |  7 +++
 src/intel/compiler/brw_fs_nir.cpp | 34 +-
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 2040575..b070d38 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -493,10 +493,9 @@ void shuffle_32bit_load_result_to_64bit_data(const 
brw::fs_builder ,
  const fs_reg ,
  uint32_t components);
 
-void shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
-const fs_reg ,
-const fs_reg ,
-uint32_t components);
+fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder ,
+  const fs_reg ,
+  uint32_t components);
 fs_reg setup_imm_df(const brw::fs_builder ,
 double v);
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index d0625c8..5bcdb1a 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2646,10 +2646,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder ,
 * expected by our 32-bit URB write messages. We use a temporary
 * for that.
 */
-   fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
unsigned channel = iter * 2 + i;
-   shuffle_64bit_data_for_32bit_write(bld,
-  retype(dest, BRW_REGISTER_TYPE_F),
+   fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
   retype(offset(value, bld, 2 * channel), 
BRW_REGISTER_TYPE_DF),
   1);
 
@@ -3506,14 +3504,9 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder ,
   unsigned type_size = 4;
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
- fs_reg tmp =
-   fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
- shuffle_64bit_data_for_32bit_write(
-bld,
-retype(tmp, BRW_REGISTER_TYPE_F),
+ val_reg = shuffle_64bit_data_for_32bit_write(bld,
 retype(val_reg, BRW_REGISTER_TYPE_DF),
 instr->num_components);
- val_reg = tmp;
   }
 
   unsigned type_slots = type_size / 4;
@@ -4011,13 +4004,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned type_size = 4;
   if (nir_src_bit_size(instr->src[0]) == 64) {
  type_size = 8;
- fs_reg tmp =
-   fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
- shuffle_64bit_data_for_32bit_write(bld,
-retype(tmp, BRW_REGISTER_TYPE_F),
+ val_reg = shuffle_64bit_data_for_32bit_write(bld,
 retype(val_reg, BRW_REGISTER_TYPE_DF),
 instr->num_components);
- val_reg = tmp;
   }
 
   unsigned type_slots = type_size / 4;
@@ -4075,11 +4064,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   unsigned num_components = instr->num_components;
   unsigned first_component = nir_intrinsic_component(instr);
   if (nir_src_bit_size(instr->src[0]) == 64) {
- fs_reg tmp =
-fs_reg(VGRF, alloc.allocate(2 * num_components),
-   BRW_REGISTER_TYPE_F);
- shuffle_64bit_data_for_32bit_write(
-bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
+ fs_reg tmp = shuffle_64bit_data_for_32bit_write(bld,
+retype(src, BRW_REGISTER_TYPE_DF), num_components);
  src = retype(tmp, src.type);
  num_components *= 2;
   }
@@ -4767,24 +4753,22 @@ shuffle_32bit_load_result_to_64bit_data(const 
fs_builder ,
  * 64-bit data they are about to write. Because of this the function checks
  * that the src and dst regions involved in the operation do not overlap.
  */
-void
+fs_reg
 shuffle_64bit_data_for_32bit_write(const fs_builder ,
-   const fs_reg ,
const fs_reg ,
uint32_t components)
 {
assert(type_sz(src.type) == 8);
-   assert(type_sz(dst.type) == 4);
 
-   assert(!regions_overlap(
- dst, 2 * components * dst.component_size(bld.dispatch_width()),
- src, components * src.component_size(bld.dispatch_width(;
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
 
for (unsigned i = 0; i < 

[Mesa-dev] [PATCH v3 14/48] i965/fs: Extend the live ranges of VGRFs which leave loops

2017-10-25 Thread Jason Ekstrand
No Shader-db changes.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_live_variables.cpp | 55 
 1 file changed, 55 insertions(+)

diff --git a/src/intel/compiler/brw_fs_live_variables.cpp 
b/src/intel/compiler/brw_fs_live_variables.cpp
index c449672..380060d 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -223,6 +223,61 @@ fs_live_variables::compute_start_end()
  }
   }
}
+
+   /* Due to the explicit way the SIMD data is handled on GEN, we need to be a
+* bit more careful with live ranges and loops.  Consider the following
+* example:
+*
+*vec4 color2;
+*while (1) {
+*   vec4 color = texture();
+*   if (...) {
+*  color2 = color * 2;
+*  break;
+*   }
+*}
+*gl_FragColor = color2;
+*
+* In this case, the definition of color2 dominates the use because the
+* loop only has the one exit.  This means that the live range interval for
+* color2 goes from the statement in the if to its use below the loop.
+* Now suppose that the texture operation has a header register that gets
+* assigned one of the registers used for color2.  If the loop condition is
+* non-uniform, some of the threads will take the break and others will
+* continue.  In this case, the next pass through the loop, the WE_all
+* setup of the header register will stomp the disabled channels of color2
+* and corrupt the value.
+*
+* This same problem can occur if you have a mix of 64, 32, and 16-bit
+* registers because the channels do not line up or if you have a SIMD16
+* program and the first half of one value overlaps the second half of the
+* other.
+*
+* To solve this problem, we take any VGRFs whose live ranges cross the
+* while instruction of a loop and extend their live ranges to the top of
+* the loop.  This more accurately models the hardware because the value in
+* the VGRF needs to be carried through subsequent loop iterations in order
+* to remain valid when we finally do break.
+*/
+   foreach_block (block, cfg) {
+  if (block->end()->opcode != BRW_OPCODE_WHILE)
+ continue;
+
+  /* This is a WHILE instruction. Find the DO block. */
+  bblock_t *do_block = NULL;
+  foreach_list_typed(bblock_link, child_link, link, >children) {
+ if (child_link->block->start_ip < block->end_ip) {
+assert(do_block == NULL);
+do_block = child_link->block;
+ }
+  }
+  assert(do_block);
+
+  for (int i = 0; i < num_vars; i++) {
+ if (start[i] < block->end_ip && end[i] > block->end_ip)
+start[i] = MIN2(start[i], do_block->start_ip);
+  }
+   }
 }
 
 fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 11/48] intel/fs: Fix MOV_INDIRECT for 64-bit values on little-core

2017-10-25 Thread Jason Ekstrand
The same workaround we need for 64-bit values on little core also takes
care of the Ivy Bridge problem and does so a bit more efficiently so we
can drop that code while we're here.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_generator.cpp | 75 +
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 2aa79c3..cc9f8ad 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -486,45 +486,48 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
* code, using it saves us 0 instructions and would require quite a bit
* of case-by-case work.  It's just not worth it.
*/
-  if (devinfo->gen >= 8 || devinfo->is_haswell || type_sz(reg.type) < 8) {
- brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
-  } else {
- /* IVB reads two address register components per channel for
-  * indirectly addressed 64-bit sources, so we need to initialize
-  * adjacent address components to consecutive dwords of the source
-  * region by emitting two separate ADD instructions.  Found
-  * empirically.
+  brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+
+  if (type_sz(reg.type) > 4 &&
+  ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+   devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
+ /* IVB has an issue (which we found empirically) where it reads two
+  * address register components per channel for indirectly addressed
+  * 64-bit sources.
+  *
+  * From the Cherryview PRM Vol 7. "Register Region Restrictions":
+  *
+  *"When source or destination datatype is 64b or operation is
+  *integer DWord multiply, indirect addressing must not be used."
+  *
+  * To work around both of these, we do two integer MOVs instead of one
+  * 64-bit MOV.  Because no double value should ever cross a register
+  * boundary, it's safe to use the immediate offset in the indirect
+  * here to handle adding 4 bytes to the offset and avoid the extra
+  * ADD to the register file.
   */
- assert(inst->exec_size <= 4);
- brw_push_insn_state(p);
- brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
-
- brw_ADD(p, spread(addr, 2), indirect_byte_offset,
- brw_imm_uw(imm_byte_offset));
- brw_inst_set_no_dd_clear(devinfo, brw_last_inst, true);
-
- brw_ADD(p, spread(suboffset(addr, 1), 2), indirect_byte_offset,
- brw_imm_uw(imm_byte_offset + 4));
- brw_inst_set_no_dd_check(devinfo, brw_last_inst, true);
-
- brw_pop_insn_state(p);
-  }
-
-  struct brw_reg ind_src = brw_VxH_indirect(0, 0);
+ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
+retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
+ brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
+retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
+  } else {
+ struct brw_reg ind_src = brw_VxH_indirect(0, 0);
 
-  brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
+ brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
 
-  if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
-  !inst->get_next()->is_tail_sentinel() &&
-  ((fs_inst *)inst->get_next())->mlen > 0) {
- /* From the Sandybridge PRM:
-  *
-  *"[Errata: DevSNB(SNB)] If MRF register is updated by any
-  *instruction that “indexed/indirect” source AND is followed by a
-  *send, the instruction requires a “Switch”. This is to avoid
-  *race condition where send may dispatch before MRF is updated."
-  */
- brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
+ if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
+ !inst->get_next()->is_tail_sentinel() &&
+ ((fs_inst *)inst->get_next())->mlen > 0) {
+/* From the Sandybridge PRM:
+ *
+ *"[Errata: DevSNB(SNB)] If MRF register is updated by any
+ *instruction that “indexed/indirect” source AND is followed
+ *by a send, the instruction requires a “Switch”. This is to
+ *avoid race condition where send may dispatch before MRF is
+ *updated."
+ */
+brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
+ }
   }
}
 }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 22/48] intel/fs: Retype dest to match value in read[First]Invocation

2017-10-25 Thread Jason Ekstrand
This is what we really wanted all along.  Always retyping to D works
because that's what get_nir_src() always gives us, at least for 32-bit
types.  The SPIR-V variants of these operations accept arbitrary types
and we need this if we're going to handle 64 or 16-bit values.
---
 src/intel/compiler/brw_fs_nir.cpp | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index a3a863e..7556576 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4306,15 +4306,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
   bld.emit_uniformize(invocation));
 
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
-  fs_reg(component(tmp, 0)));
+  bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
   break;
}
 
case nir_intrinsic_read_first_invocation: {
   const fs_reg value = get_nir_src(instr->src[0]);
-  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
-  bld.emit_uniformize(value));
+  bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
   break;
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 12/48] intel/fs: Fix integer multiplication lowering for src/dst hazards

2017-10-25 Thread Jason Ekstrand
Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index ef36af9..56455e9 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -3480,8 +3480,14 @@ fs_visitor::lower_integer_multiplication()
  * schedule multi-component multiplications much better.
  */
 
+bool needs_mov = false;
 fs_reg orig_dst = inst->dst;
-if (orig_dst.is_null() || orig_dst.file == MRF) {
+if (orig_dst.is_null() || orig_dst.file == MRF ||
+regions_overlap(inst->dst, inst->size_written,
+inst->src[0], inst->size_read(0)) ||
+regions_overlap(inst->dst, inst->size_written,
+inst->src[1], inst->size_read(1))) {
+   needs_mov = true;
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
   inst->dst.type);
 }
@@ -3512,7 +3518,7 @@ fs_visitor::lower_integer_multiplication()
  subscript(low, BRW_REGISTER_TYPE_UW, 1),
  subscript(high, BRW_REGISTER_TYPE_UW, 0));
 
-if (inst->conditional_mod || orig_dst.file == MRF) {
+if (needs_mov || inst->conditional_mod) {
set_condmod(inst->conditional_mod,
ibld.MOV(orig_dst, inst->dst));
 }
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 15/48] intel/fs: Restrict live intervals to the subset possibly reachable from any definition.

2017-10-25 Thread Jason Ekstrand
From: Francisco Jerez 

Currently the liveness analysis pass would extend a live interval up
to the top of the program when no unconditional and complete
definition of the variable is found that dominates all of its uses.

This can lead to a serious performance problem in shaders containing
many partial writes, like scalar arithmetic, FP64 and soon FP16
operations.  The number of oversize live intervals in such workloads
can cause the compilation time of the shader to explode because of the
worse than quadratic behavior of the register allocator and scheduler
when running out of registers, and it can also cause the running time
of the shader to explode due to the amount of spilling it leads to,
which is orders of magnitude slower than GRF memory.

This patch fixes it by computing the intersection of our current live
intervals with the subset of the program that can possibly be reached
from any definition of the variable.  Extending the storage allocation
of the variable beyond that is pretty useless because its value is
guaranteed to be undefined at a point that cannot be reached from any
definition.

No significant change in the running time of shader-db (with 5%
statistical significance).

shader-db results on IVB:

  total cycles in shared programs: 61108780 -> 60932856 (-0.29%)
  cycles in affected programs: 16335482 -> 16159558 (-1.08%)
  helped: 5121
  HURT: 4347

  total spills in shared programs: 1309 -> 1288 (-1.60%)
  spills in affected programs: 249 -> 228 (-8.43%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1652 -> 1597 (-3.33%)
  fills in affected programs: 262 -> 207 (-20.99%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 209

shader-db results on BDW:

  total cycles in shared programs: 67617262 -> 67361220 (-0.38%)
  cycles in affected programs: 23397142 -> 23141100 (-1.09%)
  helped: 8045
  HURT: 6488

  total spills in shared programs: 1456 -> 1252 (-14.01%)
  spills in affected programs: 465 -> 261 (-43.87%)
  helped: 3
  HURT: 0

  total fills in shared programs: 1720 -> 1465 (-14.83%)
  fills in affected programs: 471 -> 216 (-54.14%)
  helped: 4
  HURT: 0

  LOST:   2
  GAINED: 162

shader-db results on SKL:

  total cycles in shared programs: 65436248 -> 65245186 (-0.29%)
  cycles in affected programs: 22560936 -> 22369874 (-0.85%)
  helped: 8457
  HURT: 6247

  total spills in shared programs: 437 -> 437 (0.00%)
  spills in affected programs: 0 -> 0
  helped: 0
  HURT: 0

  total fills in shared programs: 870 -> 854 (-1.84%)
  fills in affected programs: 16 -> 0
  helped: 1
  HURT: 0

  LOST:   0
  GAINED: 107
Reviewed-by: Jason Ekstrand 
---
 src/intel/compiler/brw_fs_live_variables.cpp | 34 
 src/intel/compiler/brw_fs_live_variables.h   | 12 ++
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs_live_variables.cpp 
b/src/intel/compiler/brw_fs_live_variables.cpp
index 380060d..6330cff 100644
--- a/src/intel/compiler/brw_fs_live_variables.cpp
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -83,9 +83,11 @@ fs_live_variables::setup_one_write(struct block_data *bd, 
fs_inst *inst,
/* The def[] bitset marks when an initialization in a block completely
 * screens off previous updates of that variable (VGRF channel).
 */
-   if (inst->dst.file == VGRF && !inst->is_partial_write()) {
-  if (!BITSET_TEST(bd->use, var))
+   if (inst->dst.file == VGRF) {
+  if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var))
  BITSET_SET(bd->def, var);
+
+  BITSET_SET(bd->defout, var);
}
 }
 
@@ -199,6 +201,28 @@ fs_live_variables::compute_live_variables()
  }
   }
}
+
+   /* Propagate defin and defout down the CFG to calculate the union of live
+* variables potentially defined along any possible control flow path.
+*/
+   do {
+  cont = false;
+
+  foreach_block (block, cfg) {
+ const struct block_data *bd = _data[block->num];
+
+foreach_list_typed(bblock_link, child_link, link, >children) {
+struct block_data *child_bd = _data[child_link->block->num];
+
+   for (int i = 0; i < bitset_words; i++) {
+   const BITSET_WORD new_def = bd->defout[i] & ~child_bd->defin[i];
+   child_bd->defin[i] |= new_def;
+   child_bd->defout[i] |= new_def;
+   cont |= new_def;
+   }
+}
+  }
+   } while (cont);
 }
 
 /**
@@ -212,12 +236,12 @@ fs_live_variables::compute_start_end()
   struct block_data *bd = _data[block->num];
 
   for (int i = 0; i < num_vars; i++) {
- if (BITSET_TEST(bd->livein, i)) {
+ if (BITSET_TEST(bd->livein, i) && BITSET_TEST(bd->defin, i)) {
 start[i] = MIN2(start[i], block->start_ip);
 end[i] = MAX2(end[i], block->start_ip);
  }
 
- if (BITSET_TEST(bd->liveout, i)) {
+ if (BITSET_TEST(bd->liveout, i) && 

[Mesa-dev] [PATCH v3 07/48] intel/compiler: Add some restrictions to MOV_INDIRECT and BROADCAST

2017-10-25 Thread Jason Ekstrand
These restrictions effectively already existed due to the way we use
indirect sources but weren't being directly enforced.
---
 src/intel/compiler/brw_eu_emit.c| 16 
 src/intel/compiler/brw_fs_generator.cpp |  2 ++
 src/intel/compiler/brw_shader.cpp   |  2 ++
 3 files changed, 20 insertions(+)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 4f0be55..e10b143 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3373,6 +3373,8 @@ brw_broadcast(struct brw_codegen *p,
 
assert(src.file == BRW_GENERAL_REGISTER_FILE &&
   src.address_mode == BRW_ADDRESS_DIRECT);
+   assert(!src.abs && !src.negate);
+   assert(src.type == dst.type);
 
if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
idx.file == BRW_IMMEDIATE_VALUE) {
@@ -3385,6 +3387,20 @@ brw_broadcast(struct brw_codegen *p,
   (align1 ? stride(suboffset(src, i), 0, 1, 0) :
stride(suboffset(src, 4 * i), 0, 4, 1)));
} else {
+  /* From the Haswell PRM section "Register Region Restrictions":
+   *
+   *"The lower bits of the AddressImmediate must not overflow to
+   *change the register address.  The lower 5 bits of Address
+   *Immediate when added to lower 5 bits of address register gives
+   *the sub-register offset. The upper bits of Address Immediate
+   *when added to upper bits of address register gives the register
+   *address. Any overflow from sub-register offset is dropped."
+   *
+   * Fortunately, for broadcast, we never have a sub-register offset so
+   * this isn't an issue.
+   */
+  assert(src.subnr == 0);
+
   if (align1) {
  const struct brw_reg addr =
 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
diff --git a/src/intel/compiler/brw_fs_generator.cpp 
b/src/intel/compiler/brw_fs_generator.cpp
index 0558c82..2aa79c3 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -435,6 +435,8 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
 {
assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
+   assert(!reg.abs && !reg.negate);
+   assert(reg.type == dst.type);
 
unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
 
diff --git a/src/intel/compiler/brw_shader.cpp 
b/src/intel/compiler/brw_shader.cpp
index 7d62479f..8f2dc3e 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -855,6 +855,8 @@ backend_instruction::can_do_source_mods() const
case BRW_OPCODE_FBH:
case BRW_OPCODE_FBL:
case BRW_OPCODE_SUBB:
+   case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_MOV_INDIRECT:
   return false;
default:
   return true;
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 05/48] intel/fs: Use an explicit D type for vote any/all/eq intrinsics

2017-10-25 Thread Jason Ekstrand
The any/all intrinsics return a boolean value so D or UD is the correct
type.  Unfortunately, get_nir_dest has the annoying behavior of
returning a float type by default.  This causes format conversion which
gives us -1.0f or 0.0f in the register.  If the consumer of the result
does an integer comparison to zero, it will give you the right boolean
value but if we do something more clever based on the 0/~0 assumption
for booleans, this will give the wrong value.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index dcd9942..3143bc6 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4207,6 +4207,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
@@ -4229,6 +4231,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
@@ -4253,6 +4257,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
   }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
+
+  dest.type = BRW_REGISTER_TYPE_D;
   bld.MOV(dest, brw_imm_d(-1));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 04/48] intel/fs: Don't stomp f0.1 in SIMD16 ballot

2017-10-25 Thread Jason Ekstrand
In fragment shaders f0.1 is used for discards so doing ballot after a
discard can potentially cause the discard to not happen.  However, we
don't support SIMD32 fragment shaders yet so this isn't a problem.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index f29d4e8..dcd9942 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4264,8 +4264,15 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
case nir_intrinsic_ballot: {
   const fs_reg value = retype(get_nir_src(instr->src[0]),
   BRW_REGISTER_TYPE_UD);
-  const struct brw_reg flag = retype(brw_flag_reg(0, 0),
- BRW_REGISTER_TYPE_UD);
+  struct brw_reg flag = brw_flag_reg(0, 0);
+  /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
+   * as f0.0.  This is a problem for fragment programs as we currently use
+   * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
+   * programs yet so this isn't a problem.  When we do, something will
+   * have to change.
+   */
+  if (dispatch_width == 32)
+ flag.type = BRW_REGISTER_TYPE_UD;
 
   bld.exec_all().MOV(flag, brw_imm_ud(0u));
   bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 02/48] intel/fs: Be more explicit about our placement of [un]zip

2017-10-25 Thread Jason Ekstrand
Before, we were careful to place the zip after the last of the split
instructions but did unzip on-demand.  This changes things so that the
unzips go before all of the split instructions and the zip comes
explicitly after all the split instructions.  As a side-effect of this
change, we now emit the split instruction from highest SIMD group to
lowest instead of low to high.  We could have kept the old behavior, but
it shouldn't matter and this made the code easier.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 49ca58d..ef36af9 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5186,6 +5186,7 @@ fs_visitor::lower_simd_width()
 
  assert(!inst->writes_accumulator && !inst->mlen);
 
+ exec_node * const after_inst = inst->next;
  for (unsigned i = 0; i < n; i++) {
 /* Emit a copy of the original instruction with the lowered width.
  * If the EOT flag was set throw it away except for the last
@@ -5193,7 +5194,7 @@ fs_visitor::lower_simd_width()
  */
 fs_inst split_inst = *inst;
 split_inst.exec_size = lower_width;
-split_inst.eot = inst->eot && i == n - 1;
+split_inst.eot = inst->eot && i == 0;
 
 /* Select the correct channel enables for the i-th group, then
  * transform the sources and destination and emit the lowered
@@ -5205,11 +5206,11 @@ fs_visitor::lower_simd_width()
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
 
 split_inst.dst = emit_zip(lbld.at(block, inst),
-  lbld.at(block, inst->next), inst);
+  lbld.at(block, after_inst), inst);
 split_inst.size_written =
split_inst.dst.component_size(lower_width) * dst_size;
 
-lbld.emit(split_inst);
+lbld.at(block, inst->next).emit(split_inst);
  }
 
  inst->remove(block);
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 01/48] intel/fs: Pass builders instead of blocks into emit_[un]zip

2017-10-25 Thread Jason Ekstrand
This makes it far more explicit where we're inserting the instructions
rather than the magic "before and after" stuff that the emit_[un]zip
helpers did based on block and inst.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp | 50 ---
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 4616529..49ca58d 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5025,8 +5025,7 @@ needs_src_copy(const fs_builder , const fs_inst 
*inst, unsigned i)
  * will be emitted before the given \p inst in \p block.
  */
 static fs_reg
-emit_unzip(const fs_builder , bblock_t *block, fs_inst *inst,
-   unsigned i)
+emit_unzip(const fs_builder , fs_inst *inst, unsigned i)
 {
/* Specified channel group from the source region. */
const fs_reg src = horiz_offset(inst->src[i], lbld.group());
@@ -5041,8 +5040,7 @@ emit_unzip(const fs_builder , bblock_t *block, 
fs_inst *inst,
   const fs_reg tmp = lbld.vgrf(inst->src[i].type, 
inst->components_read(i));
 
   for (unsigned k = 0; k < inst->components_read(i); ++k)
- cbld.at(block, inst)
- .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
+ cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
 
   return tmp;
 
@@ -5112,36 +5110,43 @@ needs_dst_copy(const fs_builder , const fs_inst 
*inst)
  * be emitted around the given \p inst in \p block.
  */
 static fs_reg
-emit_zip(const fs_builder , bblock_t *block, fs_inst *inst)
+emit_zip(const fs_builder _before, const fs_builder _after,
+ fs_inst *inst)
 {
-   /* Builder of the right width to perform the copy avoiding uninitialized
-* data if the lowered execution size is greater than the original
-* execution size of the instruction.
-*/
-   const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
-   inst->exec_size), 0);
+   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
+   assert(lbld_before.group() == lbld_after.group());
 
/* Specified channel group from the destination region. */
-   const fs_reg dst = horiz_offset(inst->dst, lbld.group());
+   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group());
const unsigned dst_size = inst->size_written /
   inst->dst.component_size(inst->exec_size);
 
-   if (needs_dst_copy(lbld, inst)) {
-  const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
+   if (needs_dst_copy(lbld_after, inst)) {
+  const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
 
   if (inst->predicate) {
  /* Handle predication by copying the original contents of
   * the destination into the temporary before emitting the
   * lowered instruction.
   */
- for (unsigned k = 0; k < dst_size; ++k)
-cbld.at(block, inst)
-.MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
+ for (unsigned k = 0; k < dst_size; ++k) {
+lbld_before.group(MIN2(lbld_before.dispatch_width(),
+   inst->exec_size), 0)
+   .MOV(offset(tmp, lbld_before, k),
+offset(dst, inst->exec_size, k));
+ }
   }
 
-  for (unsigned k = 0; k < dst_size; ++k)
- cbld.at(block, inst->next)
- .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
+  for (unsigned k = 0; k < dst_size; ++k) {
+ /* Use a builder of the right width to perform the copy avoiding
+  * uninitialized data if the lowered execution size is greater than
+  * the original execution size of the instruction.
+  */
+ lbld_after.group(MIN2(lbld_after.dispatch_width(),
+   inst->exec_size), 0)
+   .MOV(offset(dst, inst->exec_size, k),
+offset(tmp, lbld_after, k));
+  }
 
   return tmp;
 
@@ -5197,9 +5202,10 @@ fs_visitor::lower_simd_width()
 const fs_builder lbld = ibld.group(lower_width, i);
 
 for (unsigned j = 0; j < inst->sources; j++)
-   split_inst.src[j] = emit_unzip(lbld, block, inst, j);
+   split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
 
-split_inst.dst = emit_zip(lbld, block, inst);
+split_inst.dst = emit_zip(lbld.at(block, inst),
+  lbld.at(block, inst->next), inst);
 split_inst.size_written =
split_inst.dst.component_size(lower_width) * dst_size;
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 03/48] intel/fs: Use ANY/ALL32 predicates in SIMD32

2017-10-25 Thread Jason Ekstrand
We have ANY/ALL32 predicates and, for the most part, they work just
fine.  (See the next commit for more details.)  Also, due to the way
that flag registers are handled in hardware, instruction splitting is
able to split the CMP correctly.  Specifically, the hardware looks at
the execution group and knows to shift its flag usage up correctly so a
2H instruction will write to f0.1 instead of f0.0.

Reviewed-by: Matt Turner 
Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 42 ---
 1 file changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index bb153ca..f29d4e8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4199,12 +4199,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
+  }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ANY8H :
-BRW_PREDICATE_ALIGN1_ANY16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
+   BRW_PREDICATE_ALIGN1_ANY32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
@@ -4215,12 +4221,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ALL8H :
-BRW_PREDICATE_ALIGN1_ALL16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+   BRW_PREDICATE_ALIGN1_ALL32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
@@ -4233,12 +4245,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
-  ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  if (dispatch_width == 32) {
+ /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
+ ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(0x));
+  } else {
+ ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0x));
+  }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
   bld.MOV(dest, brw_imm_d(-1));
-  set_predicate(dispatch_width == 8 ?
-BRW_PREDICATE_ALIGN1_ALL8H :
-BRW_PREDICATE_ALIGN1_ALL16H,
+  set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
+dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
+   BRW_PREDICATE_ALIGN1_ALL32H,
 bld.SEL(dest, dest, brw_imm_d(0)));
   break;
}
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 08/48] intel/eu: Just modify the offset in brw_broadcast

2017-10-25 Thread Jason Ekstrand
This means we have to drop const from a variable but it also means that
100% of the code which deals with the offset limit is in one place.
---
 src/intel/compiler/brw_eu_emit.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index e10b143..a18cfa4 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3404,7 +3404,7 @@ brw_broadcast(struct brw_codegen *p,
   if (align1) {
  const struct brw_reg addr =
 retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
- const unsigned offset = src.nr * REG_SIZE + src.subnr;
+ unsigned offset = src.nr * REG_SIZE + src.subnr;
  /* Limit in bytes of the signed indirect addressing immediate. */
  const unsigned limit = 512;
 
@@ -3422,15 +3422,16 @@ brw_broadcast(struct brw_codegen *p,
   * addressing immediate, account for the difference if the source
   * register is above this limit.
   */
- if (offset >= limit)
+ if (offset >= limit) {
 brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+offset = offset % limit;
+ }
 
  brw_pop_insn_state(p);
 
  /* Use indirect addressing to fetch the specified component. */
  brw_MOV(p, dst,
- retype(brw_vec1_indirect(addr.subnr, offset % limit),
-src.type));
+ retype(brw_vec1_indirect(addr.subnr, offset), src.type));
   } else {
  /* In SIMD4x2 mode the index can be either zero or one, replicate it
   * to all bits of a flag register,
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 06/48] intel/fs: Use a pair of 1-wide MOVs instead of SEL for any/all

2017-10-25 Thread Jason Ekstrand
For some reason, the any/all predicates don't work properly with SIMD32.
In particular, it appears that a SEL with a QtrCtrl of 2H doesn't read
the correct subset of the flag register and you end up getting garbage
in the second half.  Work around this by using a pair of 1-wide MOVs and
scattering the result.  This fixes the any/all instructions for SIMD32.

Reviewed-by: Matt Turner 
Cc: mesa-sta...@lists.freedesktop.org
---
 src/intel/compiler/brw_fs_nir.cpp | 42 ++-
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp 
b/src/intel/compiler/brw_fs_nir.cpp
index 3143bc6..f433e3b 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4208,12 +4208,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
BRW_PREDICATE_ALIGN1_ANY32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
case nir_intrinsic_vote_all: {
@@ -4232,12 +4240,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), 
BRW_CONDITIONAL_NZ);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
case nir_intrinsic_vote_eq: {
@@ -4258,12 +4274,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder , 
nir_intrinsic_instr *instr
   }
   bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
 
-  dest.type = BRW_REGISTER_TYPE_D;
-  bld.MOV(dest, brw_imm_d(-1));
+  /* For some reason, the any/all predicates don't work properly with
+   * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
+   * doesn't read the correct subset of the flag register and you end up
+   * getting garbage in the second half.  Work around this by using a pair
+   * of 1-wide MOVs and scattering the result.
+   */
+  fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
+  ubld.MOV(res1, brw_imm_d(0));
   set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
-bld.SEL(dest, dest, brw_imm_d(0)));
+ubld.MOV(res1, brw_imm_d(-1)));
+
+  bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
   break;
}
 
-- 
2.5.0.400.gff86faf

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3 00/48] nir, intel: Prerequisites for subgroups

2017-10-25 Thread Jason Ekstrand
This series is a third respin of my subgroups prerequisites series that
I sent out a few weeks ago.  Not a whole lot has changed but there are
some new patches.  Primarily,

 1) Some patches which were reviewed by Matt and Lionel were pushed and are
no longer in the series.  Thanks guys!

 2) I've applied R-B tags from various people for patches which are
reviewed but depend on still unreviewed patches.

 3) A few patches to fix little-core.  In particular, the extra little-core
EU restrictions cause problems for BROADCAST, MOV_INDIRECT, and integer
MUL.

This series can be found on fd.o here:

https://cgit.freedesktop.org/~jekstrand/mesa/log/?h=review/subgroup-prereqs-v3

Happy reviewing!


Cc: Matt Turner 
Cc: Francisco Jerez 
Cc: Connor Abbott 

Francisco Jerez (1):
  intel/fs: Restrict live intervals to the subset possibly reachable
from any definition.

Jason Ekstrand (47):
  intel/fs: Pass builders instead of blocks into emit_[un]zip
  intel/fs: Be more explicit about our placement of [un]zip
  intel/fs: Use ANY/ALL32 predicates in SIMD32
  intel/fs: Don't stomp f0.1 in SIMD16 ballot
  intel/fs: Use an explicit D type for vote any/all/eq intrinsics
  intel/fs: Use a pair of 1-wide MOVs instead of SEL for any/all
  intel/compiler: Add some restrictions to MOV_INDIRECT and BROADCAST
  intel/eu: Just modify the offset in brw_broadcast
  intel/eu/reg: Add a subscript() helper
  intel/eu: Fix broadcast instruction for 64-bit values on little-core
  intel/fs: Fix MOV_INDIRECT for 64-bit values on little-core
  intel/fs: Fix integer multiplication lowering for src/dst hazards
  intel/fs: Use the original destination region for int MUL lowering
  i965/fs: Extend the live ranges of VGRFs which leave loops
  i965/fs/nir: Simplify 64-bit store_output
  i965/fs: Return a fs_reg from shuffle_64bit_data_for_32bit_write
  i965/fs/nir: Minor refactor of store_output
  i965/fs/nir: Don't stomp 64-bit values to D in get_nir_src
  intel/fs: Protect opt_algebraic from OOB BROADCAST indices
  intel/fs: Uniformize the index in readInvocation
  intel/fs: Retype dest to match value in read[First]Invocation
  intel/fs: Assign constant locations if they haven't been assigned
  intel/fs: Remove min_dispatch_width from fs_visitor
  intel/cs: Drop max_dispatch_width checks from compile_cs
  intel/cs: Stop setting dispatch_grf_start_reg
  intel/cs: Ignore runtime_check_aads_emit for CS
  intel/fs: Mark 64-bit values as being contiguous
  intel/cs: Rework the way thread local ID is handled
  intel/cs: Re-run final NIR optimizations for each SIMD size
  intel/cs: Re-run final NIR optimizations for each SIMD size
  intel/cs: Push subgroup ID instead of base thread ID
  intel/compiler/fs: Set up subgroup invocation as a system value
  intel/fs: Rework zero-length URB write handling
  intel/eu: Make automatic exec sizes a configurable option
  intel/eu: Explicitly set EXECUTE_1 where needed
  intel/fs: Explicitly set EXECUTE_1 where needed
  intel/fs: Don't use automatic exec size inference
  nir: Add a new subgroups lowering pass
  nir: Add a ssa_dest_init_for_type helper
  nir: Make ballot intrinsics variable-size
  nir/lower_system_values: Lower SUBGROUP_*_MASK based on type
  nir/lower_subgroups: Lower ballot intrinsics to the specified bit size
  nir,intel/compiler: Use a fixed subgroup size
  spirv: Add a vtn_constant_value helper
  spirv: Rework barriers
  nir: Validate base types on array dereferences
  compiler/nir_types: Handle vectors in glsl_get_array_element

 src/compiler/Makefile.sources  |   2 +-
 src/compiler/glsl/glsl_to_nir.cpp  |   1 +
 src/compiler/nir/nir.h |  25 +-
 src/compiler/nir/nir_intrinsics.h  |  13 +-
 .../nir/nir_lower_read_invocation_to_scalar.c  | 112 -
 src/compiler/nir/nir_lower_subgroups.c | 257 
 src/compiler/nir/nir_lower_system_values.c |   4 +-
 src/compiler/nir/nir_opt_intrinsics.c  |  69 +-
 src/compiler/nir/nir_validate.c|  18 +-
 src/compiler/nir_types.cpp |   2 +
 src/compiler/spirv/spirv_to_nir.c  | 132 --
 src/compiler/spirv/vtn_private.h   |   6 +
 src/intel/compiler/brw_compiler.c  |   4 -
 src/intel/compiler/brw_compiler.h  |   3 +-
 src/intel/compiler/brw_eu.c|   1 +
 src/intel/compiler/brw_eu.h|  10 +
 src/intel/compiler/brw_eu_emit.c   |  90 +--
 src/intel/compiler/brw_fs.cpp  | 268 -
 src/intel/compiler/brw_fs.h|  15 +-
 src/intel/compiler/brw_fs_generator.cpp|  90 ---
 src/intel/compiler/brw_fs_live_variables.cpp   |  89 ++-
 src/intel/compiler/brw_fs_live_variables.h |  

Re: [Mesa-dev] [PATCH v2] clover: Fix compilation after clang r315871

2017-10-25 Thread Jan Vesely
On Tue, 2017-10-24 at 15:32 +0200, Vedran Miletić wrote:
> On 10/23/2017 05:24 AM, Jan Vesely wrote:
> > From: Jan Vesely 
> > 
> > v2: use a more generic compat function
> > 
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103388
> > Signed-off-by: Jan Vesely 
> > ---
> >  src/gallium/state_trackers/clover/llvm/codegen/common.cpp |  5 ++---
> >  src/gallium/state_trackers/clover/llvm/compat.hpp | 12 ++--
> >  2 files changed, 12 insertions(+), 5 deletions(-)
> > 
> > diff --git a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp 
> > b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
> > index 075183400a..dd9d02ab11 100644
> > --- a/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
> > +++ b/src/gallium/state_trackers/clover/llvm/codegen/common.cpp
> > @@ -70,7 +70,6 @@ namespace {
> > make_kernel_args(const Module , const std::string _name,
> >  const clang::CompilerInstance ) {
> >std::vector args;
> > -  const auto address_spaces = c.getTarget().getAddressSpaceMap();
> >const Function  = *mod.getFunction(kernel_name);
> >::llvm::DataLayout dl();
> >const auto size_type =
> > @@ -128,8 +127,8 @@ namespace {
> > const unsigned address_space =
> >cast< 
> > ::llvm::PointerType>(actual_type)->getAddressSpace();
> >  
> > -   if (address_space == 
> > address_spaces[clang::LangAS::opencl_local
> > -   - 
> > compat::lang_as_offset]) {
> > +   if (address_space == compat::target_lang_address_space(
> > +  c.getTarget(), 
> > clang::LangAS::opencl_local)) {
> >args.emplace_back(module::argument::local, arg_api_size,
> >  target_size, target_align,
> >  module::argument::zero_ext);
> > diff --git a/src/gallium/state_trackers/clover/llvm/compat.hpp 
> > b/src/gallium/state_trackers/clover/llvm/compat.hpp
> > index f8b56516d5..3e34f0dd94 100644
> > --- a/src/gallium/state_trackers/clover/llvm/compat.hpp
> > +++ b/src/gallium/state_trackers/clover/llvm/compat.hpp
> > @@ -69,11 +69,19 @@ namespace clover {
> >   typedef ::llvm::TargetLibraryInfo target_library_info;
> >  #endif
> >  
> > + template
> > + unsigned target_lang_address_space(const T& target, const AS 
> > lang_as) {
> > +const auto  = target.getAddressSpaceMap();
> > +#if HAVE_LLVM >= 0x0500
> > +return map[static_cast(lang_as)];
> > +#else
> > +return map[lang_as - clang::LangAS::Offset];
> > +#endif
> > + }
> > +
> >  #if HAVE_LLVM >= 0x0500
> > - const auto lang_as_offset = 0;
> >   const clang::InputKind ik_opencl = clang::InputKind::OpenCL;
> >  #else
> > - const auto lang_as_offset = clang::LangAS::Offset;
> >   const clang::InputKind ik_opencl = clang::IK_OpenCL;
> >  #endif
> >  
> > 
> 
> Thanks for improving the patch. Future-proof thinking: what if the value
> of clang::LangAS::Default changes from 0 to some other constant?

Hi Vedran,

you're right that it'd be more future proof, but I liked the one line
simplicity of the current version. Future clang changes will require
adaptations, but I don't expect clang to go back to non-0 lang AS
indices. Feel free to add "I told you so" if they prove me wrong :)

> 
> Other than that, this patch is:
> 
> Reviewed-by: Vedran Miletić 

I was not sure if this applied even without the change so I pushed it
only with Francisco's rb.

thanks,
Jan

> 
> Regards,
> Vedran
> 


signature.asc
Description: This is a digitally signed message part
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/9] osmesa: Include generated headers without path

2017-10-25 Thread Dylan Baker
This makes things much easier to ensure correctness with meson. Tested
with make dist-check and with meson.

Signed-off-by: Dylan Baker 
---
 src/mesa/drivers/osmesa/Makefile.am | 2 ++
 src/mesa/drivers/osmesa/osmesa.c| 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/osmesa/Makefile.am 
b/src/mesa/drivers/osmesa/Makefile.am
index 2c8d4668b1b..3c7def5c5c9 100644
--- a/src/mesa/drivers/osmesa/Makefile.am
+++ b/src/mesa/drivers/osmesa/Makefile.am
@@ -30,6 +30,8 @@ AM_CPPFLAGS = \
-I$(top_srcdir)/src/gallium/auxiliary \
-I$(top_builddir)/src/mapi \
-I$(top_srcdir)/src/mapi \
+   -I$(top_builddir)/src/mapi/glapi \
+   -I$(top_srcdir)/src/mapi/glapi \
-I$(top_srcdir)/src/mesa/ \
$(DEFINES)
 AM_CFLAGS = $(PTHREAD_CFLAGS) \
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 734a4e891cb..72bff3da2c6 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -1291,7 +1291,7 @@ OSMesaPostprocess(OSMesaContext osmesa, const char 
*filter,
 #define GL_GLEXT_PROTOTYPES
 #include "GL/gl.h"
 #include "glapi/glapi.h"
-#include "glapi/glapitable.h"
+#include "glapitable.h"
 
 #if defined(USE_MGL_NAMESPACE)
 #define NAME(func)  mgl##func
@@ -1307,6 +1307,6 @@ OSMesaPostprocess(OSMesaContext osmesa, const char 
*filter,
 
 /* skip normal ones */
 #define _GLAPI_SKIP_NORMAL_ENTRY_POINTS
-#include "glapi/glapitemp.h"
+#include "glapitemp.h"
 
 #endif /* GLX_INDIRECT_RENDERING */
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/9] meson: move gallium include declarations to src

2017-10-25 Thread Dylan Baker
These are used by non-gallium osmesa, so they need to be defined outside
of the gallium subdirectory.

Signed-off-by: Dylan Baker 
---
 src/gallium/auxiliary/meson.build | 2 --
 src/gallium/meson.build   | 1 -
 src/meson.build   | 2 ++
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/meson.build 
b/src/gallium/auxiliary/meson.build
index 778b4ce4ac2..bb7c0506d84 100644
--- a/src/gallium/auxiliary/meson.build
+++ b/src/gallium/auxiliary/meson.build
@@ -18,8 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-inc_gallium_aux = include_directories('.')
-
 files_libgallium = files(
   'cso_cache/cso_cache.c',
   'cso_cache/cso_cache.h',
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index 570c37e6475..e0941103b93 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -18,7 +18,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-inc_gallium = include_directories('include')
 inc_gallium_drivers = include_directories('drivers')
 inc_gallium_winsys = include_directories('winsys')
 
diff --git a/src/meson.build b/src/meson.build
index 9b1b0ae594d..c32b666c8bc 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -23,6 +23,8 @@ inc_common = include_directories(
 inc_mesa = include_directories('mesa')
 inc_mapi = include_directories('mapi')
 inc_src = include_directories('.')
+inc_gallium = include_directories('gallium/include')
+inc_gallium_aux = include_directories('gallium/auxiliary')
 
 libglsl_util = static_library(
   'glsl_util',
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/9] meson: fix glprocs.h generator

2017-10-25 Thread Dylan Baker
There was a typo that caused the generated file to be called gl_procs.h
instead.

Signed-off-by: Dylan Baker 
Reviewed-by: Eric Engestrom 
---
 src/mapi/glapi/gen/meson.build | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mapi/glapi/gen/meson.build b/src/mapi/glapi/gen/meson.build
index 4360346edad..69ef57bc146 100644
--- a/src/mapi/glapi/gen/meson.build
+++ b/src/mapi/glapi/gen/meson.build
@@ -153,10 +153,10 @@ glapi_mapi_tmp_h = custom_target(
   capture : true,
 )
 
-gl_procs_h = custom_target(
-  'gl_procs.h',
+glprocs_h = custom_target(
+  'glprocs.h',
   input : ['gl_procs.py', 'gl_and_es_API.xml'],
-  output : 'gl_procs.h',
+  output : 'glprocs.h',
   command : [prog_python2, '@INPUT0@', '-c', '-f', '@INPUT1@'],
   depend_files : glapi_gen_depends,
   capture : true,
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/9] meson: rename all instances of xf86vm to xxf86vm

2017-10-25 Thread Dylan Baker
Because consistency

Signed-off-by: Dylan Baker 
---
 meson.build | 6 +++---
 src/glx/meson.build | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/meson.build b/meson.build
index c480dd410d9..5b31194b0df 100644
--- a/meson.build
+++ b/meson.build
@@ -736,7 +736,7 @@ dep_xcb_dri2 = []
 dep_xcb_dri3 = []
 dep_dri2proto = []
 dep_glproto = []
-dep_xf86vm = []
+dep_xxf86vm = []
 dep_xcb_dri3 = []
 dep_xcb_present = []
 dep_xcb_sync = []
@@ -749,7 +749,7 @@ if with_platform_x11
 dep_xdamage = dependency('xdamage', version : '>= 1.1')
 dep_xfixes = dependency('xfixes')
 dep_xcb_glx = dependency('xcb-glx', version : '>= 1.8.1')
-dep_xf86vm = dependency('xxf86vm', required : false)
+dep_xxf86vm = dependency('xxf86vm', required : false)
   endif
   if with_any_vk or (with_glx == 'dri' and with_dri_platform == 'drm')
 dep_xcb = dependency('xcb')
@@ -813,7 +813,7 @@ gl_priv_reqs = [
   'x11', 'xext', 'xdamage >= 1.1', 'xfixes', 'x11-xcb', 'xcb',
   'xcb-glx >= 1.8.1', 'libdrm >= 2.4.75',
 ]
-if dep_xf86vm != [] and dep_xf86vm.found()
+if dep_xxf86vm != [] and dep_xxf86vm.found()
   gl_priv_reqs += 'xxf86vm'
 endif
 if with_dri_platform == 'drm'
diff --git a/src/glx/meson.build b/src/glx/meson.build
index 3fe5fcf0cd6..573316c9424 100644
--- a/src/glx/meson.build
+++ b/src/glx/meson.build
@@ -126,7 +126,7 @@ gl_lib_cargs = [
   '-D_REENTRANT', '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_driver_dir),
 ]
 
-if dep_xf86vm != [] and dep_xf86vm.found()
+if dep_xxf86vm != [] and dep_xxf86vm.found()
   gl_lib_cargs += '-DHAVE_XF86VIDMODE'
 endif
 
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 8/9] meson: build classic osmesa

2017-10-25 Thread Dylan Baker
From: Dylan Baker 

This builds the classic (non-gallium) osmesa with meson. This has been
tested with the osdemo application from mesa-demos.

Signed-off-by: Dylan Baker 
---
 include/meson.build |  2 +-
 meson.build | 26 ++--
 meson_options.txt   | 14 +++
 src/mapi/glapi/meson.build  |  2 +-
 src/mesa/drivers/osmesa/meson.build | 48 +
 src/mesa/meson.build|  3 +++
 src/meson.build |  1 -
 7 files changed, 86 insertions(+), 10 deletions(-)
 create mode 100644 src/mesa/drivers/osmesa/meson.build

diff --git a/include/meson.build b/include/meson.build
index 88e66a1a8f4..b5f533bd185 100644
--- a/include/meson.build
+++ b/include/meson.build
@@ -55,7 +55,7 @@ if with_glx != 'disabled'
   install_headers('GL/glx.h', 'GL/glext.h', 'GL/glx_mangle.h', subdir : 'GL')
 endif
 
-if with_osmesa
+if with_osmesa != 'none'
   install_headers('GL/osmesa.h', subdir : 'GL')
 endif
 
diff --git a/meson.build b/meson.build
index 5b31194b0df..79ce59c6b27 100644
--- a/meson.build
+++ b/meson.build
@@ -46,6 +46,8 @@ with_valgrind = get_option('valgrind')
 with_libunwind = get_option('libunwind')
 with_asm = get_option('asm')
 with_llvm = get_option('llvm')
+with_osmesa = get_option('osmesa')
+with_glx_direct = true
 if get_option('texture-float')
   pre_args += '-DTEXTURE_FLOAT_ENABLED'
   message('WARNING: Floating-point texture enabled. Please consult 
docs/patents.txt and your lawyer before building mesa.')
@@ -67,9 +69,6 @@ with_any_opengl = with_opengl or with_gles1 or with_gles2
 # Only build shared_glapi if at least one OpenGL API is enabled
 with_shared_glapi = get_option('shared-glapi') and with_any_opengl
 
-# TODO: these will need options, but at the moment they just control header
-# installs
-with_osmesa = false
 
 # shared-glapi is required if at least two OpenGL APIs are being built
 if not with_shared_glapi
@@ -230,9 +229,6 @@ if with_glvnd and with_glx != 'dri'
   message('glvnd requires dri based glx')
 endif
 
-# TODO: toggle for this
-with_glx_direct = true
-
 if with_vulkan_icd_dir == ''
   with_vulkan_icd_dir = join_paths(get_option('datadir'), 'vulkan/icd.d')
 endif
@@ -704,7 +700,22 @@ endif
 
 # TODO: glx provider
 
-# TODO: osmesa provider
+if with_osmesa != 'none'
+  if with_osmesa == 'classic' and not with_dri_swrast
+error('OSMesa classic requires dri (classic) swrast.')
+  endif
+  osmesa_lib_name = 'OSMesa'
+  osmesa_bits = get_option('osmesa-bits')
+  if osmesa_bits != '8'
+if with_dri or with_glx != 'disabled'
+  error('OSMesa bits must be 8 if building glx or dir based drivers')
+endif
+osmesa_lib_name = osmesa_lib_name + osmesa_bits
+pre_args += [
+  '-DCHAN_BITS=@0@'.format(osmesa_bits), '-DDEFAULT_SOFTWARE_DEPTH_BITS=31'
+]
+  endif
+endif
 
 # TODO: symbol mangling
 
@@ -731,6 +742,7 @@ dep_xext = []
 dep_xdamage = []
 dep_xfixes = []
 dep_x11_xcb = []
+dep_xcb = []
 dep_xcb_glx = []
 dep_xcb_dri2 = []
 dep_xcb_dri3 = []
diff --git a/meson_options.txt b/meson_options.txt
index b44c93df001..97aca571a48 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -160,3 +160,17 @@ option(
   value : false,
   description : 'Enable floating point textures and renderbuffers. This option 
may be patent encumbered, please read docs/patents.txt and consult with your 
lawyer before turning this on.'
 )
+option(
+  'osmesa',
+  type : 'combo',
+  value : 'none',
+  choices : ['none', 'classic'],
+  description : 'Build OSmesa.'
+)
+option(
+  'osmesa-bits',
+  type : 'combo',
+  value : '8',
+  choices : ['8', '16', '32'],
+  description : 'Number of channel bits for OSMesa.'
+)
diff --git a/src/mapi/glapi/meson.build b/src/mapi/glapi/meson.build
index 74f84d289ba..d2d86afd6c1 100644
--- a/src/mapi/glapi/meson.build
+++ b/src/mapi/glapi/meson.build
@@ -69,7 +69,7 @@ endif
 libglapi_static = static_library(
   'glapi_static',
   static_glapi_files,
-  include_directories : [inc_mesa, inc_include, inc_src],
+  include_directories : [inc_mesa, inc_include, inc_src, inc_mapi],
   c_args : [c_msvc_compat_args, static_glapi_args],
   dependencies : [dep_thread, dep_selinux],
   build_by_default : false,
diff --git a/src/mesa/drivers/osmesa/meson.build 
b/src/mesa/drivers/osmesa/meson.build
new file mode 100644
index 000..14c61a85d60
--- /dev/null
+++ b/src/mesa/drivers/osmesa/meson.build
@@ -0,0 +1,48 @@
+# Copyright © 2017 Dylan Baker
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the 

[Mesa-dev] [PATCH 1/9] meson: fix pkg-config Gl Require.Private

2017-10-25 Thread Dylan Baker
xf86vm -> xxf86vm

Signed-off-by: Dylan Baker 
---
 meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meson.build b/meson.build
index 2a89b6482f9..c480dd410d9 100644
--- a/meson.build
+++ b/meson.build
@@ -814,7 +814,7 @@ gl_priv_reqs = [
   'xcb-glx >= 1.8.1', 'libdrm >= 2.4.75',
 ]
 if dep_xf86vm != [] and dep_xf86vm.found()
-  gl_priv_reqs += 'xf86vm'
+  gl_priv_reqs += 'xxf86vm'
 endif
 if with_dri_platform == 'drm'
   gl_priv_reqs += 'xcb-dri2 >= 1.8'
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/9] Meson: osmesa support

2017-10-25 Thread Dylan Baker
This series adds support to the meson build system for both classic osmesa and
gallium based osmesa. There are quite a few cleanup patches in this series as
well, one of which is reviewed but sent back out so the whole series can be
reviewed.

Dylan

Dylan Baker (9):
  meson: fix pkg-config Gl Require.Private
  meson: rename all instances of xf86vm to xxf86vm
  meson: fix glprocs.h generator
  meson: move gallium include declarations to src
  osmesa: Include generated headers without path
  glapi: include generated headers without path
  meson: Add generated files to non-shared glapi
  meson: build classic osmesa
  meson: build gallium based osmesa

 include/meson.build|  2 +-
 meson.build| 37 
 meson_options.txt  | 14 +
 src/gallium/auxiliary/meson.build  |  2 -
 src/gallium/meson.build|  8 ++-
 .../state_trackers/osmesa}/meson.build | 54 +++--
 src/gallium/targets/osmesa/meson.build | 68 ++
 src/glx/meson.build|  2 +-
 src/mapi/glapi/gen/meson.build |  6 +-
 src/mapi/glapi/glapi_dispatch.c|  2 +-
 src/mapi/glapi/glapi_getproc.c |  4 +-
 src/mapi/glapi/glapi_nop.c |  2 +-
 src/mapi/glapi/meson.build |  5 +-
 src/mesa/drivers/osmesa/Makefile.am|  2 +
 src/mesa/drivers/osmesa/meson.build| 48 +++
 src/mesa/drivers/osmesa/osmesa.c   |  4 +-
 src/mesa/meson.build   |  3 +
 src/meson.build|  3 +-
 18 files changed, 191 insertions(+), 75 deletions(-)
 copy src/{ => gallium/state_trackers/osmesa}/meson.build (50%)
 create mode 100644 src/gallium/targets/osmesa/meson.build
 create mode 100644 src/mesa/drivers/osmesa/meson.build

-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 7/9] meson: Add generated files to non-shared glapi

2017-10-25 Thread Dylan Baker
Signed-off-by: Dylan Baker 
---
 src/mapi/glapi/meson.build | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mapi/glapi/meson.build b/src/mapi/glapi/meson.build
index d3e070d0d1a..74f84d289ba 100644
--- a/src/mapi/glapi/meson.build
+++ b/src/mapi/glapi/meson.build
@@ -55,6 +55,9 @@ else
 'glapi_priv.h',
   )
   static_glapi_files += files_mapi_util
+  static_glapi_files += [
+glapitable_h, glapi_mapi_tmp_h, glprocs_h, glapitemp_h,
+  ]
   if with_asm_arch == 'x86'
 static_glapi_files += glapi_x86_s
   elif with_asm_arch == 'x86_64'
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/9] glapi: include generated headers without path

2017-10-25 Thread Dylan Baker
This has been tested with make dist-check and with meson.

Signed-off-by: Dylan Baker 
---
 src/mapi/glapi/glapi_dispatch.c | 2 +-
 src/mapi/glapi/glapi_getproc.c  | 4 ++--
 src/mapi/glapi/glapi_nop.c  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mapi/glapi/glapi_dispatch.c b/src/mapi/glapi/glapi_dispatch.c
index df907ff9d6a..269adc7ea00 100644
--- a/src/mapi/glapi/glapi_dispatch.c
+++ b/src/mapi/glapi/glapi_dispatch.c
@@ -38,7 +38,7 @@
  */
 
 #include "glapi/glapi_priv.h"
-#include "glapi/glapitable.h"
+#include "glapitable.h"
 
 
 #if !(defined(USE_X86_ASM) || defined(USE_X86_64_ASM) || 
defined(USE_SPARC_ASM))
diff --git a/src/mapi/glapi/glapi_getproc.c b/src/mapi/glapi/glapi_getproc.c
index a6b2455f1d5..d3fd3f82dcd 100644
--- a/src/mapi/glapi/glapi_getproc.c
+++ b/src/mapi/glapi/glapi_getproc.c
@@ -34,7 +34,7 @@
 #include 
 #include 
 #include "glapi/glapi_priv.h"
-#include "glapi/glapitable.h"
+#include "glapitable.h"
 
 
 #define FIRST_DYNAMIC_OFFSET (sizeof(struct _glapi_table) / sizeof(void *))
@@ -49,7 +49,7 @@
 #if !defined(DISPATCH_FUNCTION_SIZE) 
 # define NEED_FUNCTION_POINTER
 #endif
-#include "glapi/glprocs.h"
+#include "glprocs.h"
 
 
 /**
diff --git a/src/mapi/glapi/glapi_nop.c b/src/mapi/glapi/glapi_nop.c
index 13db310b303..2e130e63ebc 100644
--- a/src/mapi/glapi/glapi_nop.c
+++ b/src/mapi/glapi/glapi_nop.c
@@ -116,7 +116,7 @@ NoOpUnused(void)
 #define DISPATCH_TABLE_NAME __glapi_noop_table
 #define UNUSED_TABLE_NAME __unused_noop_functions
 
-#include "glapi/glapitemp.h"
+#include "glapitemp.h"
 
 
 /** Return pointer to new dispatch table filled with no-op functions */
-- 
2.14.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 9/9] meson: build gallium based osmesa

2017-10-25 Thread Dylan Baker
This has been tested with the osdemo from mesa-demos

Signed-off-by: Dylan Baker 
---
 meson.build   |  3 ++
 meson_options.txt |  2 +-
 src/gallium/meson.build   |  7 ++-
 src/gallium/state_trackers/osmesa/meson.build | 28 +++
 src/gallium/targets/osmesa/meson.build| 68 +++
 5 files changed, 106 insertions(+), 2 deletions(-)
 create mode 100644 src/gallium/state_trackers/osmesa/meson.build
 create mode 100644 src/gallium/targets/osmesa/meson.build

diff --git a/meson.build b/meson.build
index 79ce59c6b27..0bbe330042b 100644
--- a/meson.build
+++ b/meson.build
@@ -704,6 +704,9 @@ if with_osmesa != 'none'
   if with_osmesa == 'classic' and not with_dri_swrast
 error('OSMesa classic requires dri (classic) swrast.')
   endif
+  if with_osmesa == 'gallium' and not with_gallium_softpipe
+error('OSMesa gallium requires gallium softpipe or llvmpipe.')
+  endif
   osmesa_lib_name = 'OSMesa'
   osmesa_bits = get_option('osmesa-bits')
   if osmesa_bits != '8'
diff --git a/meson_options.txt b/meson_options.txt
index 97aca571a48..a0b8044e4bb 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -164,7 +164,7 @@ option(
   'osmesa',
   type : 'combo',
   value : 'none',
-  choices : ['none', 'classic'],
+  choices : ['none', 'classic', 'gallium'],
   description : 'Build OSmesa.'
 )
 option(
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index e0941103b93..6edfe80321d 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -66,6 +66,9 @@ if with_gallium_imx
   subdir('winsys/imx/drm')
 endif
 subdir('state_trackers/dri')
+if with_osmesa == 'gallium'
+  subdir('state_trackers/osmesa')
+endif
 # TODO: i915
 # TODO: SVGA
 # TODO: r300
@@ -77,9 +80,11 @@ subdir('state_trackers/dri')
 if with_dri and with_gallium
   subdir('targets/dri')
 endif
+if with_osmesa == 'gallium'
+  subdir('targets/osmesa')
+endif
 # TODO: xlib-glx
 # TODO: OMX
-# TODO: osmesa
 # TODO: VA
 # TODO: vdpau
 # TODO: xa
diff --git a/src/gallium/state_trackers/osmesa/meson.build 
b/src/gallium/state_trackers/osmesa/meson.build
new file mode 100644
index 000..dacf10512d6
--- /dev/null
+++ b/src/gallium/state_trackers/osmesa/meson.build
@@ -0,0 +1,28 @@
+# Copyright © 2017 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libosmesa_st = static_library(
+  'osmesa_st',
+  'osmesa.c',
+  c_args : ['-DGALLIUM_SOFTPIPE', '-DGALLIUM_TRACE'],
+  include_directories : [
+inc_include, inc_src, inc_gallium, inc_gallium_aux, inc_mapi, inc_mesa,
+  ],
+)
diff --git a/src/gallium/targets/osmesa/meson.build 
b/src/gallium/targets/osmesa/meson.build
new file mode 100644
index 000..af81c5adbbe
--- /dev/null
+++ b/src/gallium/targets/osmesa/meson.build
@@ -0,0 +1,68 @@
+# Copyright © 2017 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 

Re: [Mesa-dev] Build mesa-dev on Windows with AVX instruction set problem

2017-10-25 Thread Roland Scheidegger
Am 26.10.2017 um 00:26 schrieb Ilia Mirkin:
> On Wed, Oct 25, 2017 at 6:15 PM, Fabrício Ceolin
> >
> wrote:
> 
> Hi,
> 
> Thanks. I recompiled everything (on Windows) using this real machine:
> 
> #under msys2
> $ cat /proc/cpuinfo
> processor       : 0
> vendor_id       : GenuineIntel
> cpu family      : 6
> model           : 23
> model name      : Genuine Intel(R) CPU           U2300  @ 1.20GHz
> stepping        : 10
> cpu MHz         : 1197.000
> cache size      : 1024 KB
> physical id     : 0
> siblings        : 2
> core id         : 0
> cpu cores       : 2
> apicid          : 0
> initial apicid  : 0
> fpu             : yes
> fpu_exception   : yes
> cpuid level     : 13
> wp              : yes
> flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr
> pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm
> pbe pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm xsave
> osxsave lahf_lm dtherm
> clflush size    : 64
> cache_alignment : 64
> address sizes   : 36 bits physical, 48 bits virtual
> power management:
> 
> but I got the same error:
> 
> 7FF85FBF874F 48 83 7A 18 10       cmp         qword ptr
> [rdx+18h],10h  
> 7FF85FBF8754 48 8B 6A 10          mov         rbp,qword ptr
> [rdx+10h]  
> 7FF85FBF8758 72 03                jb         
> std::basic_string >::basic_string >+2Dh (07FF85FBF875Dh)  
> 7FF85FBF875A 48 8B 32             mov         rsi,qword ptr [rdx]  
> 7FF85FBF875D 48 83 FD 10          cmp         rbp,10h  
> 7FF85FBF8761 73 27                jae       
>  std::basic_string >::basic_string >+5Ah (07FF85FBF878Ah)  
> *7FF85FBF8763 C5 F8 10 06          vmovups     xmm0,xmmword ptr
> [rsi]  *
> 7FF85FBF8767 C5 F8 11 01          vmovups     xmmword ptr
> [rcx],xmm0  
> 7FF85FBF876B 48 89 69 10          mov         qword ptr
> [rcx+10h],rbp
> 
> Are there any parameters that I can use for avoid to use AVX (or
> better) instructions?
> 
> 
> LP_NATIVE_VECTOR_WIDTH=128 should force you to a non-avx path. There's
> also a LP_FORCE_SSE2=1 which will also avoid sse3/4 usage. However all
> this stuff should be getting detected, so it's odd that it's messing up.
> Perhaps run with GALLIUM_DUMP_CPU=1 to see what's being detected?
> 

Also, a backtrace when it crashes would be nice. I am not convinced this
is actually happening inside a code-generated shader...

Roland


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [Mesa-stable] [PATCH 3/7] configure.ac: rework llvm libs handling for 3.9+

2017-10-25 Thread Andres Gomez
Emil, this patch series introduces a lot of changes on how we treat
LLVM's dependency.

After applying this patch in the stable queue, I get these errors:

For the "make Gallium Drivers SWR" and "make Gallium Drivers Other"
targets:

"
...
  CXXLDgallium_dri.la
/usr/bin/ld: /usr/lib/llvm-3.9/lib/libLLVMX86CodeGen.a(X86FloatingPoint.cpp.o): 
unrecognized relocation (0x2a) in section 
`.text._ZNK12_GLOBAL__N_13FPS16getAnalysisUsageERN4llvm13AnalysisUsageE'
/usr/bin/ld: final link failed: Bad value
collect2: error: ld returned 1 exit status
...
"

While, for the "make Vulkan" target:

"
...
  CXXLDlibvulkan_radeon.la
/usr/bin/ld: 
/usr/lib/llvm-3.9/lib/libLLVMAMDGPUCodeGen.a(R600OptimizeVectorRegisters.cpp.o):
 unrecognized relocation (0x2a) in section 
`.text._ZNK12_GLOBAL__N_119R600VectorRegMerger16getAnalysisUsageERN4llvm13AnalysisUsageE'
/usr/bin/ld: final link failed: Bad value
collect2: error: ld returned 1 exit status
...
"

Bumping to binutils 2.26 with the subsequent patch of the series
doesn't seem to help. You can take a look at:

https://travis-ci.org/Igalia/release-mesa/builds/292370423


So, although, if I'm understanding correctly, this seems to be a
problem in LLVM/binutils rather than in Mesa, I'm wary of applying
this whole series.

What do you think?

Thanks.


On Thu, 2017-10-05 at 11:19 +0100, Emil Velikov wrote:
> From: Emil Velikov 
> 
> Earlier versions need different quirks, but as of LLVM 3.9 llvm-config
> provides --link-shared/link-static toggles.
> 
> The output of which seems to be reliable - looking at LLVM 3.9, 4.0 and
> 5.0.
> 
> Note that the earlier code will still be used for pre-LLVM 3.9 and is
> unchanged.
> 
> This effectively fixes LLVM static linking, while providing a clearer
> and more robust solution for future versions.
> 
> Mildly interesting side notes:
> 
>  - build-mode (introduced with 3.8) was buggy with 3.8
> It shows "static" when built with -DLLVM_LINK_LLVM_DYLIB=ON, yet it was
> consistent with --libs. The latter shows the static libraries.
> 
>  - libnames and libfiles are broken with LLVM 3.9
> The library prefix and extension is printed twice liblibLLVM-3.9.so.so
> 
> Cc: mesa-sta...@lists.freedesktop.org
> Cc: Dieter Nützel 
> Cc: Michel Dänzer 
> Signed-off-by: Emil Velikov 
> ---
>  configure.ac | 29 +++--
>  1 file changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/configure.ac b/configure.ac
> index 308938a5a88..dbaa6569e4e 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -2684,18 +2684,27 @@ if test "x$enable_llvm" = xyes; then
>  dnl this was causing the same libraries to be appear multiple times
>  dnl in LLVM_LIBS.
>  
> -LLVM_LIBS="`$LLVM_CONFIG --libs ${LLVM_COMPONENTS}`"
> -
> -if test "x$enable_llvm_shared_libs" = xyes; then
> -if test $LLVM_VERSION_MAJOR -lt 4 -o "`$LLVM_CONFIG --shared-mode 
> ${LLVM_COMPONENTS}`" = static; then
> -detect_old_buggy_llvm
> +if test $LLVM_VERSION_MAJOR -ge 4 -o $LLVM_VERSION_MAJOR -eq 3 -a 
> $LLVM_VERSION_MINOR -ge 9; then
> +if test "x$enable_llvm_shared_libs" = xyes; then
> +LLVM_LIBS="`$LLVM_CONFIG --link-shared --libs 
> ${LLVM_COMPONENTS}`"
> +else
> +LLVM_LIBS="`$LLVM_CONFIG --link-static --libs ${LLVM_COMPONENTS} 
> --system-libs`"
> +dnl llvm-config lists the system libs on a separate line. While
> +dnl invoking --system-libs without --link-static assumes shared 
> link.
> +dnl Everybody now, say - Thank you LLVM developers
> +LLVM_LIBS="`echo $LLVM_LIBS | $SED 's/\n//g'`"
>  fi
>  else
> -AC_MSG_WARN([Building mesa with statically linked LLVM may cause 
> compilation issues])
> -dnl We need to link to llvm system libs when using static libs
> -dnl However, only llvm 3.5+ provides --system-libs
> -if test $LLVM_VERSION_MAJOR -ge 4 -o $LLVM_VERSION_MAJOR -eq 3 -a 
> $LLVM_VERSION_MINOR -ge 5; then
> -LLVM_LIBS="$LLVM_LIBS `$LLVM_CONFIG --system-libs`"
> +LLVM_LIBS="`$LLVM_CONFIG --libs ${LLVM_COMPONENTS}`"
> +if test "x$enable_llvm_shared_libs" = xyes; then
> +detect_old_buggy_llvm
> +else
> +AC_MSG_WARN([Building mesa with statically linked LLVM may cause 
> compilation issues])
> +dnl We need to link to llvm system libs when using static libs
> +dnl However, only llvm 3.5+ provides --system-libs
> +if test $LLVM_VERSION_MAJOR -ge 4 -o $LLVM_VERSION_MAJOR -eq 3 
> -a $LLVM_VERSION_MINOR -ge 5; then
> +LLVM_LIBS="$LLVM_LIBS `$LLVM_CONFIG --system-libs`"
> +fi
>  fi
>  fi
>  fi
-- 
Br,

Andres

signature.asc
Description: This is a digitally signed message part
___
mesa-dev mailing list

Re: [Mesa-dev] [PATCH 2/3] mesa: enable ARB_texture_buffer_* extensions in the Compatibility profile

2017-10-25 Thread Ilia Mirkin
On Sat, Oct 21, 2017 at 8:54 AM, Marek Olšák  wrote:
> From: Marek Olšák 
>
> We already have piglit tests testing alpha, luminance, and intensity
> formats. They were skipped by piglit until now.
>
> Additionally, I'm enabling one ARB_texture_buffer_range piglit test to run
> with the compat profile.
> ---
>  src/mapi/glapi/gen/apiexec.py| 12 +---
>  src/mesa/drivers/dri/i965/intel_extensions.c |  8 +---
>  src/mesa/main/extensions_table.h |  6 +++---
>  src/mesa/main/get.c  |  2 --
>  src/mesa/main/get_hash_params.py | 20 ++--
>  src/mesa/main/tests/dispatch_sanity.cpp  |  8 
>  src/mesa/main/texparam.c |  4 ++--
>  src/mesa/state_tracker/st_context.c  |  2 +-
>  src/mesa/state_tracker/st_extensions.c   |  8 +++-
>  src/mesa/state_tracker/st_extensions.h   |  3 ++-
>  src/mesa/state_tracker/st_manager.c  |  2 +-
>  11 files changed, 40 insertions(+), 35 deletions(-)
>
> diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
> index 61eda4b..7da0818 100644
> --- a/src/mapi/glapi/gen/apiexec.py
> +++ b/src/mapi/glapi/gen/apiexec.py
> @@ -28,21 +28,21 @@ class exec_info():
>  10).  For example, glCreateProgram was added in OpenGL 2.0, so
>  compatibility=20 and core=31.
>
>  If the attribute is None, then it cannot be supported by that
>  API.  For example, glNewList was removed from core profiles, so
>  compatibility=10 and core=None.
>
>  Each of the attributes that is not None must have a valid value.  The
>  valid ranges are:
>
> -compatiblity: [10, 30]
> +compatibility: [10, )
>  core: [31, )
>  es1: [10, 11]
>  es2: [20, )
>
>  These ranges are enforced by the constructor.
>  """
>  def __init__(self, compatibility=None, core=None, es1=None, es2=None):
>  if compatibility is not None:
>  assert isinstance(compatibility, int)
>  assert compatibility >= 10
> @@ -59,23 +59,22 @@ class exec_info():
>  if es2 is not None:
>  assert isinstance(es2, int)
>  assert es2 >= 20
>
>  self.compatibility = compatibility
>  self.core = core
>  self.es1 = es1
>  self.es2 = es2
>
>  functions = {
> -# OpenGL 3.1 / GL_ARB_texture_buffer_object.  Mesa only exposes this
> -# extension with core profile.
> -"TexBuffer": exec_info(core=31, es2=31),
> +# OpenGL 3.1 / GL_ARB_texture_buffer_object.
> +"TexBuffer": exec_info(compatibility=20, core=31, es2=31),
>
>  # OpenGL 3.2 / GL_OES_geometry_shader.
>  "FramebufferTexture": exec_info(core=32, es2=31),
>
>  # OpenGL 4.0 / GL_ARB_shader_subroutines. Mesa only exposes this
>  # extension with core profile.
>  "GetSubroutineUniformLocation": exec_info(core=31),
>  "GetSubroutineIndex": exec_info(core=31),
>  "GetActiveSubroutineUniformiv": exec_info(core=31),
>  "GetActiveSubroutineUniformName": exec_info(core=31),
> @@ -137,23 +136,22 @@ functions = {
>  "ViewportIndexedf": exec_info(core=32, es2=31),
>  "ViewportIndexedfv": exec_info(core=32, es2=31),
>  "ScissorArrayv": exec_info(core=32, es2=31),
>  "ScissorIndexed": exec_info(core=32, es2=31),
>  "ScissorIndexedv": exec_info(core=32, es2=31),
>  "DepthRangeArrayv": exec_info(core=32),
>  "DepthRangeIndexed": exec_info(core=32),
>  # GetFloati_v also GL_ARB_shader_atomic_counters
>  # GetDoublei_v also GL_ARB_shader_atomic_counters
>
> -# OpenGL 4.3 / GL_ARB_texture_buffer_range.  Mesa can expose the 
> extension
> -# with OpenGL 3.1.
> -"TexBufferRange": exec_info(core=31, es2=31),
> +# OpenGL 4.3 / GL_ARB_texture_buffer_range.
> +"TexBufferRange": exec_info(compatibility=20, core=31, es2=31),
>
>  # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments.  Mesa can expose the
>  # extension with OpenGL 3.0.
>  "FramebufferParameteri": exec_info(compatibility=30, core=31, es2=31),
>  "GetFramebufferParameteri": exec_info(compatibility=30, core=31, es2=31),
>
>  # OpenGL 4.5 / GL_ARB_direct_state_access.   Mesa can expose the 
> extension
>  # with core profile.
>  "CreateTransformFeedbacks": exec_info(core=31),
>  "TransformFeedbackBufferBase": exec_info(core=31),
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c 
> b/src/mesa/drivers/dri/i965/intel_extensions.c
> index 519d0a5..21cf632 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -171,23 +171,25 @@ intelInitExtensions(struct gl_context *ctx)
>   !driQueryOptionb(&brw->optionCache, "disable_blend_func_extended");
>ctx->Extensions.ARB_conditional_render_inverted = true;
>ctx->Extensions.ARB_cull_distance = true;
>ctx->Extensions.ARB_draw_buffers_blend = 

Re: [Mesa-dev] [PATCH 2/3] mesa: enable ARB_texture_buffer_* extensions in the Compatibility profile

2017-10-25 Thread Dylan Baker
For list posterity, since my intel email isn't subscribed,

There are a significant number of i965 regressions from
d96c68146a781c79a23f5181d7050174f1070d90, largely related to texturing (I can
send you a complete list of regressions if you care, but due to the large
number of them I suspect it's something fairly simple).




For example:

ES31-CTS.functional.texture.format.buffer.r32ui_npot

glGetIntegerv() failed: glGetError() returned GL_INVALID_ENUM at gluContextInfo.cpp:229

dEQP-GLES31.functional.state_query.integer.texture_buffer_binding_getinteger:

glGetIntegerv(GL_TEXTURE_BUFFER_BINDING, 0x7ffee0c43834);
// data = { -555819298 }
glGetError();
// GL_INVALID_ENUM returned
// ERROR: glGetIntegerv: glGetError() returned GL_INVALID_ENUM




Dylan

Quoting Emil Velikov (2017-10-24 09:30:03)
> Hi Marek,
> 
> On 21 October 2017 at 13:54, Marek Olšák  wrote:
> > From: Marek Olšák 
> >
> > We already have piglit tests testing alpha, luminance, and intensity
> > formats. They were skipped by piglit until now.
> >
> > Additionally, I'm enabling one ARB_texture_buffer_range piglit test to run
> > with the compat profile.
> 
> Can you please mention that ARB_texture_buffer_* on i965 is unchanged
> - aka still enabled only for core profiles.
> Out of curiosity - have you tried the series with anything more than piglit?
> 
> The Intel guys can run the lot through CTS, dEQP... admittedly only on
> Intel hardware.
> Still it should help catch if a piece is missing somewhere.
> 
> 
> -Emil
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [Mesa-stable] [PATCH 6/7] configure.ac: add missing LLVM components for OpenCL

2017-10-25 Thread Andres Gomez
Emil, by 17.2, Clover is still asking only for LLVM 3.6 and "coverage"
doesn't seem to be there.

Hence, I think we should drop or backport this patch. In any case,
check my other mail about this patch series ...

On Thu, 2017-10-05 at 11:19 +0100, Emil Velikov wrote:
> From: Emil Velikov 
> 
> Coverage and LTO seems to be hard requirements for Clang, while
> coroutines is needed as of LLVM/Clang 4.0.
> 
> Mark the last one as "optional" so we handle every case.
> 
> Cc: mesa-sta...@lists.freedesktop.org
> Signed-off-by: Emil Velikov 
> ---
>  configure.ac | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/configure.ac b/configure.ac
> index 56cbf26c778..df8cfa6b71b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -2300,13 +2300,16 @@ if test "x$enable_opencl" = xyes; then
>  
>  llvm_add_default_components "opencl"
>  llvm_add_component "all-targets" "opencl"
> +llvm_add_component "coverage" "opencl"
>  llvm_add_component "linker" "opencl"
>  llvm_add_component "instrumentation" "opencl"
>  llvm_add_component "ipo" "opencl"
>  llvm_add_component "irreader" "opencl"
> +llvm_add_component "lto" "opencl"
>  llvm_add_component "option" "opencl"
>  llvm_add_component "objcarcopts" "opencl"
>  llvm_add_component "profiledata" "opencl"
> +llvm_add_optional_component "coroutines" "opencl"
>  
>  dnl Check for Clang internal headers
>  if test -z "$CLANG_LIBDIR"; then
-- 
Br,

Andres
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Build mesa-dev on Windows with AVX instruction set problem

2017-10-25 Thread Ilia Mirkin
On Wed, Oct 25, 2017 at 6:15 PM, Fabrício Ceolin <
fabricio.ceo...@miningmath.com> wrote:

> Hi,
>
> Thanks. I recompiled everything (on Windows) using this real machine:
>
> #under msys2
> $ cat /proc/cpuinfo
> processor   : 0
> vendor_id   : GenuineIntel
> cpu family  : 6
> model   : 23
> model name  : Genuine Intel(R) CPU   U2300  @ 1.20GHz
> stepping: 10
> cpu MHz : 1197.000
> cache size  : 1024 KB
> physical id : 0
> siblings: 2
> core id : 0
> cpu cores   : 2
> apicid  : 0
> initial apicid  : 0
> fpu : yes
> fpu_exception   : yes
> cpuid level : 13
> wp  : yes
> flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
> cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe pni dtes64
> monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm xsave osxsave lahf_lm dtherm
> clflush size: 64
> cache_alignment : 64
> address sizes   : 36 bits physical, 48 bits virtual
> power management:
>
> but I got the same error:
>
> 7FF85FBF874F 48 83 7A 18 10   cmp qword ptr [rdx+18h],10h
> 7FF85FBF8754 48 8B 6A 10  mov rbp,qword ptr [rdx+10h]
> 7FF85FBF8758 72 03jb
> std::basic_string >::basic_string+2Dh
> (07FF85FBF875Dh)
> 7FF85FBF875A 48 8B 32 mov rsi,qword ptr [rdx]
> 7FF85FBF875D 48 83 FD 10  cmp rbp,10h
> 7FF85FBF8761 73 27jae
>  std::basic_string >::basic_string+5Ah
> (07FF85FBF878Ah)
> *7FF85FBF8763 C5 F8 10 06  vmovups xmm0,xmmword ptr
> [rsi]  *
> 7FF85FBF8767 C5 F8 11 01  vmovups xmmword ptr [rcx],xmm0
> 7FF85FBF876B 48 89 69 10  mov qword ptr [rcx+10h],rbp
>
> Are there any parameters that I can use for avoid to use AVX (or better)
> instructions?
>

LP_NATIVE_VECTOR_WIDTH=128 should force you to a non-avx path. There's also
a LP_FORCE_SSE2=1 which will also avoid sse3/4 usage. However all this
stuff should be getting detected, so it's odd that it's messing up. Perhaps
run with GALLIUM_DUMP_CPU=1 to see what's being detected?

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] Build mesa-dev on Windows with AVX instruction set problem

2017-10-25 Thread Fabrício Ceolin
Hi,

Thanks. I recompiled everything (on Windows) using this real machine:

#under msys2
$ cat /proc/cpuinfo
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model   : 23
model name  : Genuine Intel(R) CPU   U2300  @ 1.20GHz
stepping: 10
cpu MHz : 1197.000
cache size  : 1024 KB
physical id : 0
siblings: 2
core id : 0
cpu cores   : 2
apicid  : 0
initial apicid  : 0
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe pni dtes64
monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm xsave osxsave lahf_lm dtherm
clflush size: 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

but I got the same error:

7FF85FBF874F 48 83 7A 18 10   cmp qword ptr [rdx+18h],10h
7FF85FBF8754 48 8B 6A 10  mov rbp,qword ptr [rdx+10h]
7FF85FBF8758 72 03jb
std::basic_string::basic_string+2Dh
(07FF85FBF875Dh)
7FF85FBF875A 48 8B 32 mov rsi,qword ptr [rdx]
7FF85FBF875D 48 83 FD 10  cmp rbp,10h
7FF85FBF8761 73 27jae
 std::basic_string::basic_string+5Ah
(07FF85FBF878Ah)
*7FF85FBF8763 C5 F8 10 06  vmovups xmm0,xmmword ptr [rsi]  *
7FF85FBF8767 C5 F8 11 01  vmovups xmmword ptr [rcx],xmm0
7FF85FBF876B 48 89 69 10  mov qword ptr [rcx+10h],rbp

Are there any parameters that I can use to avoid using AVX (or newer)
instructions?

(I am using LLVM 5.0)

Thanks

[image: MiningMath Associates]

*Fabrício
Ceolin*
+55 (31) 98675-1359
MiningMath Associates 
www.miningmath.com 



2017-10-24 15:50 GMT-02:00 Roland Scheidegger :

> Am 24.10.2017 um 18:26 schrieb Fabrício Ceolin:
> > Hi,
> >
> > I am trying to build mesa-dev on windows.
> >
> > I am learning how to do it, through the
> > project https://github.com/pal1000/mesa-dist-win/tree/master/buildscript
> > 
> >
> > I tried to use the binaries from pal1000, but I got an illegal
> > instruction problem in opengl32.dll when I tried to run my application
> > (VTK based).
> >
> > 7FF96252B0CF  cmp qword ptr [rdx+18h],10h
> > 7FF96252B0D4  mov rbp,qword ptr [rdx+10h]
> > 7FF96252B0D8  jb
> > std::basic_string >>::basic_string+2Dh
> > (07FF96252B0DDh)
> > 7FF96252B0DA  mov rsi,qword ptr [rdx]
> > 7FF96252B0DD  cmp rbp,10h
> > 7FF96252B0E1  jae
> >  std::basic_string >>::basic_string+5Ah
> > (07FF96252B10Ah)
> > *7FF96252B0E3  vmovups xmm0,xmmword ptr [rsi]  *
> > 7FF96252B0E7  vmovups xmmword ptr [rcx],xmm0
> > 7FF96252B0EB  mov qword ptr [rcx+10h],rbp
> >
> > I tried to build everything on a target machine, but the problem was not
> > solved.
> >
> > I saw that movups is in the AVX instruction set here:
> > http://www.felixcloutier.com/x86/MOVUPS.html
> > 
> >
> > My target machine has the following cpuinfo (inside msys64):
> >
> > cat /proc/cpuinfo
> > processor   : 0
> > vendor_id   : GenuineIntel
> > cpu family  : 6
> > model   : 6
> > model name  : QEMU Virtual CPU version 2.0.0
> > stepping: 3
> > cpu MHz : 2993.000
> > cache size  : 4096 KB
> > fpu : yes
> > fpu_exception   : yes
> > cpuid level : 4
> > wp  : yes
> > flags   : fpu de pse tsc msr pae mce cx8 apic sep mtrr pge mca
> > cmov pat pse36 clflush mmx fxsr sse sse2 pni vmx cx16 x2apic popcnt
> > hypervisor lahf_lm epb dtherm fsgsbase tsc_adjust bmi1 hle avx2
> clflushopt
> > clflush size: 64
> > cache_alignment : 64
> > address sizes   : 40 bits physical, 48 bits virtual
> > power management:
> >
> > How can I compile opengl32.dll without using the AVX instruction set?
> >
> > My compilation command was (from mesa-dist-win):
> >
> > python c:\Python27\Scripts\scons.py build=release platform=windows
> > machine=x86_64 swr=1 libgl-gdi osmesa graw-gdi
>
> Generally, generic x86_64 target should only use sse2.
> If you're running llvmpipe though, the code will use runtime detection
> of features for generated code. I have no idea if that code there
> causing the crash was runtime-compiled 

Re: [Mesa-dev] [PATCH 2/3] i965/blorp: Use more temporary isl_format variables

2017-10-25 Thread Jason Ekstrand
On Mon, Oct 23, 2017 at 11:07 PM, Pohjolainen, Topi <
topi.pohjolai...@gmail.com> wrote:

> On Mon, Oct 23, 2017 at 05:23:08PM -0700, Jason Ekstrand wrote:
> > ---
> >  src/mesa/drivers/dri/i965/brw_blorp.c | 15 +++
> >  1 file changed, 7 insertions(+), 8 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c
> b/src/mesa/drivers/dri/i965/brw_blorp.c
> > index f7d128d..05204a9 100644
> > --- a/src/mesa/drivers/dri/i965/brw_blorp.c
> > +++ b/src/mesa/drivers/dri/i965/brw_blorp.c
> > @@ -329,6 +329,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
> > intel_miptree_prepare_access(brw, src_mt, src_level, 1, src_layer,
> 1,
> >  src_aux_usage, src_clear_supported);
> >
> > +   enum isl_format dst_isl_format =
> > +  brw_blorp_to_isl_format(brw, dst_format, true);
> > enum isl_aux_usage dst_aux_usage =
> >intel_miptree_render_aux_usage(brw, dst_mt, encode_srgb, false);
> > const bool dst_clear_supported = dst_aux_usage != ISL_AUX_USAGE_NONE;
> > @@ -352,10 +354,9 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
> > struct blorp_batch batch;
> > blorp_batch_init(>blorp, , brw, 0);
> > blorp_blit(, _surf, src_level, src_layer,
> > -  brw_blorp_to_isl_format(brw, src_format, false),
> src_isl_swizzle,
> > +  src_isl_format, src_isl_swizzle,
>
> This is a functional change, isn't it? It effectively switches from
> brw_blorp_to_isl_format() to brw_isl_format_for_mesa_format().
>

Good catch!  We also need to change the declaration of src_isl_format to
use brw_blorp_to_isl_format which, while it looks like a functional change,
isn't because the only difference is for render or depth and we explicitly
disable HiZ usage.  Would you rather that be a precursor patch?


> >_surf, dst_level, dst_layer,
> > -  brw_blorp_to_isl_format(brw, dst_format, true),
> > -  ISL_SWIZZLE_IDENTITY,
> > +  dst_isl_format, ISL_SWIZZLE_IDENTITY,
> >src_x0, src_y0, src_x1, src_y1,
> >dst_x0, dst_y0, dst_x1, dst_y1,
> >filter, mirror_x, mirror_y);
> > @@ -1158,6 +1159,7 @@ do_single_blorp_clear(struct brw_context *brw,
> struct gl_framebuffer *fb,
> > mesa_format format = irb->Base.Base.Format;
> > if (!encode_srgb && _mesa_get_format_color_encoding(format) ==
> GL_SRGB)
> >format = _mesa_get_srgb_format_linear(format);
> > +   enum isl_format isl_format = brw->mesa_to_isl_render_format[format];
> >
> > x0 = fb->_Xmin;
> > x1 = fb->_Xmax;
> > @@ -1256,8 +1258,7 @@ do_single_blorp_clear(struct brw_context *brw,
> struct gl_framebuffer *fb,
> >
> >struct blorp_batch batch;
> >blorp_batch_init(>blorp, , brw, 0);
> > -  blorp_fast_clear(, ,
> > -   brw->mesa_to_isl_render_format[format],
> > +  blorp_fast_clear(, , isl_format,
> > level, irb->mt_layer, num_layers,
> > x0, y0, x1, y1);
> >blorp_batch_finish();
> > @@ -1290,9 +1291,7 @@ do_single_blorp_clear(struct brw_context *brw,
> struct gl_framebuffer *fb,
> >
> >struct blorp_batch batch;
> >blorp_batch_init(>blorp, , brw, 0);
> > -  blorp_clear(, ,
> > -  brw->mesa_to_isl_render_format[format],
> > -  ISL_SWIZZLE_IDENTITY,
> > +  blorp_clear(, , isl_format, ISL_SWIZZLE_IDENTITY,
> >level, irb->mt_layer, num_layers,
> >x0, y0, x1, y1,
> >clear_color, color_write_disable);
> > --
> > 2.5.0.400.gff86faf
> >
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 06/52] intel/fs: Use an explicit D type for vote any/all/eq intrinsics

2017-10-25 Thread Jason Ekstrand
On Tue, Oct 17, 2017 at 2:03 PM, Matt Turner  wrote:

> On 10/12, Jason Ekstrand wrote:
>
>> They return a boolean so this is the right type.  Unfortunately,
>> get_nir_dest has the annoying behavior of giving us a float type by
>> default.  This is mostly to work around the fact that gen7 has 64-bit
>> float but no Q types.
>>
>
> I'd really like to see a clearer explanation about what this fixes.
> Something like
>
> "These intrinsics will be used to implement X, Y, Z from $extension,
> which support additional types (including 64-bit types). For 64-bit
> types get_nir_dest() returns ..., so we need to force the type to D
> since it always returns a bool."
>

That's not quite accurate.  How about this:

The any/all intrinsics return a boolean value so D or UD is the correct
type.  Unfortunately, get_nir_dest has the annoying behavior of returning
a float type by default.  This causes format conversion which gives us
-1.0f or 0.0f in the register.  If the consumer of the result does an
integer comparison to zero, it will give you the right boolean value but if
we do something more clever based on the 0/~0 assumption for booleans, this
will give the wrong value.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 43/52] nir: Add a new subgroups lowering pass

2017-10-25 Thread Jason Ekstrand
On Fri, Oct 13, 2017 at 3:52 AM, Lionel Landwerlin <
lionel.g.landwer...@intel.com> wrote:

> On 13/10/17 06:48, Jason Ekstrand wrote:
>
>> This commit pulls nir_lower_read_invocations_to_scalar along with most
>> of the guts of nir_opt_intrinsics (which mostly does subgroup lowering)
>> into a new nir_lower_subgroups pass.  There are various other bits of
>> subgroup lowering that we're going to want to do so it makes a bit more
>> sense to keep it all together in one pass.  We also move it in i965 to
>> happen after nir_lower_system_values because we want to
>> handle the subgroup mask system value intrinsics here.
>> ---
>>   src/compiler/Makefile.sources  |   2 +-
>>   src/compiler/nir/nir.h |  12 +-
>>   .../nir/nir_lower_read_invocation_to_scalar.c  | 112 --
>>   src/compiler/nir/nir_lower_subgroups.c | 161
>> +
>>   src/compiler/nir/nir_opt_intrinsics.c  |  51 +--
>>   src/intel/compiler/brw_compiler.c  |   3 -
>>   src/intel/compiler/brw_nir.c   |   8 +-
>>   7 files changed, 184 insertions(+), 165 deletions(-)
>>   delete mode 100644 src/compiler/nir/nir_lower_rea
>> d_invocation_to_scalar.c
>>   create mode 100644 src/compiler/nir/nir_lower_subgroups.c
>>
>> diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.source
>> s
>> index 2724a41..912c003 100644
>> --- a/src/compiler/Makefile.sources
>> +++ b/src/compiler/Makefile.sources
>> @@ -232,11 +232,11 @@ NIR_FILES = \
>> nir/nir_lower_passthrough_edgeflags.c \
>> nir/nir_lower_patch_vertices.c \
>> nir/nir_lower_phis_to_scalar.c \
>> -   nir/nir_lower_read_invocation_to_scalar.c \
>> nir/nir_lower_regs_to_ssa.c \
>> nir/nir_lower_returns.c \
>> nir/nir_lower_samplers.c \
>> nir/nir_lower_samplers_as_deref.c \
>> +   nir/nir_lower_subgroups.c \
>> nir/nir_lower_system_values.c \
>> nir/nir_lower_tex.c \
>> nir/nir_lower_to_source_mods.c \
>> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
>> index 5af1503..1154c42 100644
>> --- a/src/compiler/nir/nir.h
>> +++ b/src/compiler/nir/nir.h
>> @@ -1831,9 +1831,6 @@ typedef struct nir_shader_compiler_options {
>>  bool lower_extract_byte;
>>  bool lower_extract_word;
>>   -   bool lower_vote_trivial;
>> -   bool lower_subgroup_masks;
>> -
>>  /**
>>   * Does the driver support real 32-bit integers?  (Otherwise,
>> integers
>>   * are simulated by floats.)
>> @@ -2460,6 +2457,15 @@ bool nir_lower_samplers(nir_shader *shader,
>>   bool nir_lower_samplers_as_deref(nir_shader *shader,
>>const struct gl_shader_program
>> *shader_program);
>>   +typedef struct nir_lower_subgroups_options {
>> +   bool lower_to_scalar:1;
>> +   bool lower_vote_trivial:1;
>> +   bool lower_subgroup_masks:1;
>> +} nir_lower_subgroups_options;
>> +
>> +bool nir_lower_subgroups(nir_shader *shader,
>> + const nir_lower_subgroups_options *options);
>> +
>>   bool nir_lower_system_values(nir_shader *shader);
>> typedef struct nir_lower_tex_options {
>> diff --git a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
>> b/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
>> deleted file mode 100644
>> index 69e7c0a..000
>> --- a/src/compiler/nir/nir_lower_read_invocation_to_scalar.c
>> +++ /dev/null
>> @@ -1,112 +0,0 @@
>> -/*
>> - * Copyright © 2017 Intel Corporation
>> - *
>> - * Permission is hereby granted, free of charge, to any person obtaining
>> a
>> - * copy of this software and associated documentation files (the
>> "Software"),
>> - * to deal in the Software without restriction, including without
>> limitation
>> - * the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> - * and/or sell copies of the Software, and to permit persons to whom the
>> - * Software is furnished to do so, subject to the following conditions:
>> - *
>> - * The above copyright notice and this permission notice (including the
>> next
>> - * paragraph) shall be included in all copies or substantial portions of
>> the
>> - * Software.
>> - *
>> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>> SHALL
>> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> OTHER
>> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> ARISING
>> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> DEALINGS
>> - * IN THE SOFTWARE.
>> - */
>> -
>> -#include "nir.h"
>> -#include "nir_builder.h"
>> -
>> -/** @file nir_lower_read_invocation_to_scalar.c
>> - *
>> - * Replaces nir_intrinsic_read_invocation/nir_intrinsic_read_first_invoc

Re: [Mesa-dev] [PATCH 2/2] radv: Implement VK_AMD_shader_info

2017-10-25 Thread Bas Nieuwenhuizen
On Wed, Oct 25, 2017 at 4:03 PM, Samuel Pitoiset
 wrote:
>
>
> On 10/25/2017 02:20 PM, Alex Smith wrote:
>>
>> On 25 October 2017 at 12:46, Samuel Pitoiset > > wrote:
>>
>> I have something similar on my local tree (started on monday).
>>
>> Though, I don't like the way we expose the number of VGPRS/SGPRS
>> because we can't really figure out the number of spilled ones.
>>
>>
>> My assumption was that if we've spilled then we've used all available
>> registers, so if numUsed{V,S}gprs is greater than the number available, then
>> you'd know that the number spilled is the difference between the two. Can we
>> have spilling when num_{v,s}gprs is less than the number available?
>
>
> Assuming the number of waves per CU is 4, I would go with:
>
> num_available_vgprs = num_physical_vgprs (ie. 256) / max_simd_waves (aligned
> down to 4).

for compute there is

num_available_vgprs (as LLVM sees as constraints) = num_physical_vgprs
/ ceil(compute_workgroup_size / 256)

for other stages it always is 256. (Until we implement the wave limit ext)

Reading from the spec I think it is unintuitive that the usedVgpr
stats include spilled registers though. I'd
expect to see just the physically used regs. Is this something that
Feral has tried on the official driver on any platform? I'd say to not
include the spilled regs (you can get it approximately with scratch
memory / 256), unless the official driver does otherwise, in which
case we should go for consistency.

>
> (or we can just set num_available_vgprs to conf->num_vgprs and return
> num_used_vgprs = conf->num_vgprs + conf->num_spilled_vgprs).
>
> That way, if num_used_vgprs is greater than num_available_vgprs we know that
> we are spilling some vgprs.
>
> For the number of available SGPRs, I think we can just hardcode the value to
> 104 for now.
>
> Also with this, we can easily re-compute the maximum number of waves.
>
>>
>> Alex
>>
>>
>>
>> On 10/25/2017 01:18 PM, Alex Smith wrote:
>>
>> This allows an app to query shader statistics and get a
>> disassembly of
>> a shader. RenderDoc git has support for it, so this allows you
>> to view
>> shader disassembly from a capture.
>>
>> When this extension is enabled on a device (or when tracing), we
>> now
>> disable pipeline caching, since we don't get the shader debug
>> info when
>> we retrieve cached shaders.
>>
>> Signed-off-by: Alex Smith > >
>>
>> ---
>>src/amd/vulkan/radv_device.c |   9 ++
>>src/amd/vulkan/radv_extensions.py|   1 +
>>src/amd/vulkan/radv_pipeline.c   |   2 +-
>>src/amd/vulkan/radv_pipeline_cache.c |  11 ++-
>>src/amd/vulkan/radv_private.h|   3 +
>>src/amd/vulkan/radv_shader.c | 163
>> ---
>>6 files changed, 154 insertions(+), 35 deletions(-)
>>
>> diff --git a/src/amd/vulkan/radv_device.c
>> b/src/amd/vulkan/radv_device.c
>> index c4e25222ea..5603551680 100644
>> --- a/src/amd/vulkan/radv_device.c
>> +++ b/src/amd/vulkan/radv_device.c
>> @@ -943,10 +943,15 @@ VkResult radv_CreateDevice(
>>  VkResult result;
>>  struct radv_device *device;
>>+ bool keep_shader_info = false;
>> +
>>  for (uint32_t i = 0; i <
>> pCreateInfo->enabledExtensionCount; i++) {
>>  const char *ext_name =
>> pCreateInfo->ppEnabledExtensionNames[i];
>>  if
>> (!radv_physical_device_extension_supported(physical_device,
>> ext_name))
>>  return
>> vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
>> +
>> +   if (strcmp(ext_name,
>> VK_AMD_SHADER_INFO_EXTENSION_NAME) == 0)
>> +   keep_shader_info = true;
>>  }
>>  /* Check enabled features */
>> @@ -1040,10 +1045,14 @@ VkResult radv_CreateDevice(
>>  device->physical_device->rad_info.max_se >= 2;
>>  if (getenv("RADV_TRACE_FILE")) {
>> +   keep_shader_info = true;
>> +
>>  if (!radv_init_trace(device))
>>  goto fail;
>>  }
>>+ device->keep_shader_info = keep_shader_info;
>> +
>>  result = radv_device_init_meta(device);
>>  if (result != VK_SUCCESS)
>>  goto fail;
>> diff --git a/src/amd/vulkan/radv_extensions.py
>> b/src/amd/vulkan/radv_extensions.py
>> index dfeb2880fc..eeb679d65a 100644
>>  

Re: [Mesa-dev] [PATCH] meson: do not search for needless deps

2017-10-25 Thread Dylan Baker
I don't really care one way or another. When I tested it the time spent was so
insignificant I didn't care to optimize.

Acked-by: Dylan Baker 

Quoting Erik Faye-Lund (2017-10-25 01:24:49)
> If we don't want to use these deps, there's no good reason to search
> for them in the first place. This should shave a bit of time for the
> initial build.
> ---
> 
> This would be a way of dealing with Gert's suggestion. Goes on top
> of the previous patch.
> 
> Thoughts?
> 
>  meson.build | 20 ++--
>  1 file changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/meson.build b/meson.build
> index e842bb1652..201956c4c8 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -666,9 +666,13 @@ if with_glvnd
>  endif
>  
>  # TODO: make this conditional
> -dep_valgrind = dependency('valgrind', required : false)
> -if dep_valgrind.found() and with_valgrind
> -  pre_args += '-DHAVE_VALGRIND'
> +if with_valgrind
> +  dep_valgrind = dependency('valgrind', required : false)
> +  if dep_valgrind.found()
> +pre_args += '-DHAVE_VALGRIND'
> +  endif
> +else
> +  dep_valgrind = []
>  endif
>  
>  # pthread stubs. Lets not and say we didn't
> @@ -681,9 +685,13 @@ dep_selinux = []
>  
>  # TODO: llvm-prefix and llvm-shared-libs
>  
> -dep_unwind = dependency('libunwind', required : false)
> -if dep_unwind.found() and with_libunwind
> -  pre_args += '-DHAVE_LIBUNWIND'
> +if with_libunwind
> +  dep_unwind = dependency('libunwind', required : false)
> +  if dep_unwind.found()
> +pre_args += '-DHAVE_LIBUNWIND'
> +  endif
> +else
> +  dep_unwind = []
>  endif
>  
>  # TODO: flags for opengl, gles, dri
> -- 
> 2.11.0
> 


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/3] mesa: enable ARB_texture_buffer_* extensions in the Compatibility profile

2017-10-25 Thread Dylan Baker
I ran it through our CI and there do appear to be a fair number of regressions.
The majority of the errors are in the ES 3.1 CTS (there are also a few in deqp
ES 3.1) and have this as their error message:

glGetIntegerv() failed: glGetError() returned GL_INVALID_ENUM

I'm bisecting now, I'll update you when I know more.

Dylan

Quoting Emil Velikov (2017-10-24 09:30:03)
> Hi Marek,
> 
> On 21 October 2017 at 13:54, Marek Olšák  wrote:
> > From: Marek Olšák 
> >
> > We already have piglit tests testing alpha, luminance, and intensity
> > formats. They were skipped by piglit until now.
> >
> > Additionally, I'm enabling one ARB_texture_buffer_range piglit test to run
> > with the compat profile.
> 
> Can you please mention that ARB_texture_buffer_* on i965 is unchanged
> - aka still enabled only for core profiles.
> Out of curiosity - have you tried the series with anything more than piglit?
> 
> The Intel guys can run the lot through CTS, dEQP... admittedly only on
> Intel hardware.
> Still it should help catch if a piece is missing somewhere.
> 
> 
> -Emil
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH mesa] vc4: fix release build

2017-10-25 Thread Eric Anholt
Eric Engestrom  writes:

> Mesa's DEBUG and assert's NDEBUG are not tied to each other, so we need
> to explicitly compile this code out.

We should move these DEBUG ifdefs under NDEBUG, then -- the point was to
not have the field that's only used for the asserts when asserts don't
do anything.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


  1   2   >