from:"\"Rhys Perry\""

[Mesa-dev] [PATCH] ac/nir: fix txf_ms with an offset

2019-07-19 Thread Rhys Perry

Seems to fix some hair artifacts in Max Payne 3:
https://github.com/daniel-schuermann/mesa/issues/76

Signed-off-by: Rhys Perry 
Fixes: f4e499ec791 ('radv: add initial non-conformant radv vulkan driver')
---
 src/amd/common/ac_nir_to_llvm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 96bf89a8bf9..549a26ea243 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3784,7 +3784,7 @@ static void visit_tex(struct ac_nir_context *ctx, 
nir_tex_instr *instr)
goto write_result;
}
 
-   if (args.offset && instr->op != nir_texop_txf) {
+   if (args.offset && instr->op != nir_texop_txf && instr->op != 
nir_texop_txf_ms) {
LLVMValueRef offset[3], pack;
for (unsigned chan = 0; chan < 3; ++chan)
offset[chan] = ctx->ac.i32_0;
@@ -3919,7 +3919,7 @@ static void visit_tex(struct ac_nir_context *ctx, 
nir_tex_instr *instr)
args.coords[sample_chan], fmask_ptr);
}
 
-   if (args.offset && instr->op == nir_texop_txf) {
+   if (args.offset && (instr->op == nir_texop_txf || instr->op == 
nir_texop_txf_ms)) {
int num_offsets = 
instr->src[offset_src].src.ssa->num_components;
num_offsets = MIN2(num_offsets, instr->coord_components);
for (unsigned i = 0; i < num_offsets; ++i) {
-- 
2.21.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent

2019-05-31 Thread Rhys Perry

The first and last hunks are needed to pass on the shader_info to the
middle hunk, which needs it so that it can test if the compute shader
has a derivative group.

On Fri, 31 May 2019 at 18:38, Marek Olšák  wrote:
>
> The first and last hunks look like they shouldn't be there. Other than that:
>
> Reviewed-by: Marek Olšák 
>
> Marek
>
> On Fri, May 31, 2019 at 11:53 AM Rhys Perry  wrote:
>>
>> Otherwise LLVM can sink them and their texture coordinate calculations
>> into divergent branches.
>>
>> v2: simplify the conditions on which the intrinsic is marked as convergent
>> v3: only mark as convergent in FS and CS with derivative groups
>>
>> Cc: 
>> Signed-off-by: Rhys Perry 
>> ---
>>  src/amd/common/ac_nir_to_llvm.c | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/src/amd/common/ac_nir_to_llvm.c 
>> b/src/amd/common/ac_nir_to_llvm.c
>> index 265e3b636c4..9e9fade7227 100644
>> --- a/src/amd/common/ac_nir_to_llvm.c
>> +++ b/src/amd/common/ac_nir_to_llvm.c
>> @@ -38,6 +38,7 @@ struct ac_nir_context {
>> struct ac_shader_abi *abi;
>>
>> gl_shader_stage stage;
>> +   shader_info *info;
>>
>> LLVMValueRef *ssa_defs;
>>
>> @@ -1394,6 +1395,22 @@ static LLVMValueRef build_tex_intrinsic(struct 
>> ac_nir_context *ctx,
>> }
>>
>> args->attributes = AC_FUNC_ATTR_READNONE;
>> +   bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE &&
>> +ctx->info->cs.derivative_group != 
>> DERIVATIVE_GROUP_NONE;
>> +   if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
>> +   /* Prevent texture instructions with implicit derivatives 
>> from being
>> +* sinked into branches. */
>> +   switch (instr->op) {
>> +   case nir_texop_tex:
>> +   case nir_texop_txb:
>> +   case nir_texop_lod:
>> +   args->attributes |= AC_FUNC_ATTR_CONVERGENT;
>> +   break;
>> +   default:
>> +   break;
>> +   }
>> +   }
>> +
>> return ac_build_image_opcode(&ctx->ac, args);
>>  }
>>
>> @@ -4350,6 +4367,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, 
>> struct ac_shader_abi *abi,
>> ctx.abi = abi;
>>
>> ctx.stage = nir->info.stage;
>> +   ctx.info = &nir->info;
>>
>> ctx.main_function = 
>> LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
>>
>> --
>> 2.21.0
>>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent

2019-05-31 Thread Rhys Perry

Otherwise LLVM can sink them and their texture coordinate calculations
into divergent branches.

v2: simplify the conditions on which the intrinsic is marked as convergent
v3: only mark as convergent in FS and CS with derivative groups

Cc: 
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 265e3b636c4..9e9fade7227 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -38,6 +38,7 @@ struct ac_nir_context {
struct ac_shader_abi *abi;
 
gl_shader_stage stage;
+   shader_info *info;
 
LLVMValueRef *ssa_defs;
 
@@ -1394,6 +1395,22 @@ static LLVMValueRef build_tex_intrinsic(struct 
ac_nir_context *ctx,
}
 
args->attributes = AC_FUNC_ATTR_READNONE;
+   bool cs_derivs = ctx->stage == MESA_SHADER_COMPUTE &&
+ctx->info->cs.derivative_group != 
DERIVATIVE_GROUP_NONE;
+   if (ctx->stage == MESA_SHADER_FRAGMENT || cs_derivs) {
+   /* Prevent texture instructions with implicit derivatives from 
being
+* sinked into branches. */
+   switch (instr->op) {
+   case nir_texop_tex:
+   case nir_texop_txb:
+   case nir_texop_lod:
+   args->attributes |= AC_FUNC_ATTR_CONVERGENT;
+   break;
+   default:
+   break;
+   }
+   }
+
return ac_build_image_opcode(&ctx->ac, args);
 }
 
@@ -4350,6 +4367,7 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct 
ac_shader_abi *abi,
ctx.abi = abi;
 
ctx.stage = nir->info.stage;
+   ctx.info = &nir->info;
 
ctx.main_function = 
LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
-- 
2.21.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent

2019-05-30 Thread Rhys Perry

Otherwise LLVM can sink them and their texture coordinate calculations
into divergent branches.

v2: simplify the conditions on which the intrinsic is marked as convergent

Cc: 
Signed-off-by: Rhys Perry 
Reviewed-By: Bas Nieuwenhuizen 
---
 src/amd/common/ac_nir_to_llvm.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 265e3b636c4..b1a191ac24c 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1394,6 +1394,18 @@ static LLVMValueRef build_tex_intrinsic(struct 
ac_nir_context *ctx,
}
 
args->attributes = AC_FUNC_ATTR_READNONE;
+   /* Prevent texture instructions with implicit derivatives from being
+* sinked into branches. */
+   switch (instr->op) {
+   case nir_texop_tex:
+   case nir_texop_txb:
+   case nir_texop_lod:
+   args->attributes |= AC_FUNC_ATTR_CONVERGENT;
+   break;
+   default:
+   break;
+   }
+
return ac_build_image_opcode(&ctx->ac, args);
 }
 
-- 
2.21.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent

2019-05-30 Thread Rhys Perry

It seems txf can (and perhaps should) have an LOD supplied, whereas txf_ms
and tg4 always use the 0th level.

I'll add txf, txf_ms and tg4 to the list of nir_texop values that never
have implicit derivatives.

On Thu, 30 May 2019 at 19:43, Ilia Mirkin  wrote:
>
> txf supplies an lod, but tg4's is implicitly always 0.
>
> On Thu, May 30, 2019 at 2:26 PM Bas Nieuwenhuizen
>  wrote:
> >
> > On Thu, May 30, 2019 at 6:50 PM Rhys Perry  wrote:
> > >
> > > Otherwise LLVM can sink them and their texture coordinate calculations
> > > into divergent branches.
> > >
> > > Cc: 
> > > Signed-off-by: Rhys Perry 
> > > ---
> > >  src/amd/common/ac_nir_to_llvm.c | 29 +
> > >  1 file changed, 29 insertions(+)
> > >
> > > diff --git a/src/amd/common/ac_nir_to_llvm.c 
> > > b/src/amd/common/ac_nir_to_llvm.c
> > > index 265e3b636c4..d2dc617de36 100644
> > > --- a/src/amd/common/ac_nir_to_llvm.c
> > > +++ b/src/amd/common/ac_nir_to_llvm.c
> > > @@ -1316,6 +1316,30 @@ static nir_deref_instr 
> > > *get_tex_texture_deref(const nir_tex_instr *instr)
> > > return texture_deref_instr;
> > >  }
> > >
> > > +static bool has_implicit_derivatives(const nir_tex_instr *instr)
> > > +{
> > > +   switch (instr->op) {
> > > +   case nir_texop_txs:
> > > +   case nir_texop_query_levels:
> > > +   case nir_texop_texture_samples:
> > > +   case nir_texop_samples_identical:
> > > +   return false;
> > > +   default:
> > > +   break;
> > > +   }
> > > +   for (unsigned i = 0; i < instr->num_srcs; i++) {
> > > +   switch (instr->src[i].src_type) {
> > > +   case nir_tex_src_lod:
> > > +   case nir_tex_src_ddx:
> > > +   case nir_tex_src_ddy:
> > > +   return false;
> > > +   default:
> > > +   break;
> > > +   }
> > > +   }
> > > +   return true;
> > > +}
> >
> > txf, tg4 and friends do not provide any of lod/ddx/ddy do they?
> >
> > > +
> > >  static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
> > > const nir_tex_instr *instr,
> > > struct ac_image_args *args)
> > > @@ -1394,6 +1418,11 @@ static LLVMValueRef build_tex_intrinsic(struct 
> > > ac_nir_context *ctx,
> > > }
> > >
> > > args->attributes = AC_FUNC_ATTR_READNONE;
> > > +   /* Prevent texture instructions with implicit derivatives from 
> > > being
> > > +* sinked into branches. */
> > > +   if (has_implicit_derivatives(instr))
> > > +   args->attributes |= AC_FUNC_ATTR_CONVERGENT;
> > > +
> > > return ac_build_image_opcode(&ctx->ac, args);
> > >  }
> > >
> > > --
> > > 2.21.0
> > >
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] ac/nir: mark some texture intrinsics as convergent

2019-05-30 Thread Rhys Perry

Otherwise LLVM can sink them and their texture coordinate calculations
into divergent branches.

Cc: 
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 265e3b636c4..d2dc617de36 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1316,6 +1316,30 @@ static nir_deref_instr *get_tex_texture_deref(const 
nir_tex_instr *instr)
return texture_deref_instr;
 }
 
+static bool has_implicit_derivatives(const nir_tex_instr *instr)
+{
+   switch (instr->op) {
+   case nir_texop_txs:
+   case nir_texop_query_levels:
+   case nir_texop_texture_samples:
+   case nir_texop_samples_identical:
+   return false;
+   default:
+   break;
+   }
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+   switch (instr->src[i].src_type) {
+   case nir_tex_src_lod:
+   case nir_tex_src_ddx:
+   case nir_tex_src_ddy:
+   return false;
+   default:
+   break;
+   }
+   }
+   return true;
+}
+
 static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
const nir_tex_instr *instr,
struct ac_image_args *args)
@@ -1394,6 +1418,11 @@ static LLVMValueRef build_tex_intrinsic(struct 
ac_nir_context *ctx,
}
 
args->attributes = AC_FUNC_ATTR_READNONE;
+   /* Prevent texture instructions with implicit derivatives from being
+* sinked into branches. */
+   if (has_implicit_derivatives(instr))
+   args->attributes |= AC_FUNC_ATTR_CONVERGENT;
+
return ac_build_image_opcode(&ctx->ac, args);
 }
 
-- 
2.21.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: fix some compiler warnings

2019-05-30 Thread Rhys Perry

Fixes -Woverflow warnings with GCC 9.1.1

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/si_cmd_buffer.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index aae8d578c10..d87c00b94e9 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1360,7 +1360,7 @@ void radv_emit_default_sample_locations(struct 
radeon_cmdbuf *cs, int nr_samples
default:
case 1:
radeon_set_context_reg_seq(cs, 
R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-   radeon_emit(cs, centroid_priority_1x);
+   radeon_emit(cs, centroid_priority_1x & 0xffffffff);
radeon_emit(cs, centroid_priority_1x >> 32);
radeon_set_context_reg(cs, 
R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x);
radeon_set_context_reg(cs, 
R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x);
@@ -1369,7 +1369,7 @@ void radv_emit_default_sample_locations(struct 
radeon_cmdbuf *cs, int nr_samples
break;
case 2:
radeon_set_context_reg_seq(cs, 
R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-   radeon_emit(cs, centroid_priority_2x);
+   radeon_emit(cs, centroid_priority_2x & 0xffffffff);
radeon_emit(cs, centroid_priority_2x >> 32);
radeon_set_context_reg(cs, 
R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x);
radeon_set_context_reg(cs, 
R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x);
@@ -1378,7 +1378,7 @@ void radv_emit_default_sample_locations(struct 
radeon_cmdbuf *cs, int nr_samples
break;
case 4:
radeon_set_context_reg_seq(cs, 
R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-   radeon_emit(cs, centroid_priority_4x);
+   radeon_emit(cs, centroid_priority_4x & 0xffffffff);
radeon_emit(cs, centroid_priority_4x >> 32);
radeon_set_context_reg(cs, 
R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x);
radeon_set_context_reg(cs, 
R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x);
@@ -1387,7 +1387,7 @@ void radv_emit_default_sample_locations(struct 
radeon_cmdbuf *cs, int nr_samples
break;
case 8:
radeon_set_context_reg_seq(cs, 
R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
-   radeon_emit(cs, centroid_priority_8x);
+   radeon_emit(cs, centroid_priority_8x & 0xffffffff);
radeon_emit(cs, centroid_priority_8x >> 32);
radeon_set_context_reg_seq(cs, 
R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
radeon_emit_array(cs, sample_locs_8x, 4);
-- 
2.21.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 1/2] radeonsi: use new atomic LLVM helpers

2019-04-29 Thread Rhys Perry

The patch this depends on, "ac,ac/nir: use a better sync scope for
shared atomics", has been pushed:
https://gitlab.freedesktop.org/mesa/mesa/commit/bd4c661ad08e772fdccb562ffbb2f45705c4fec8

On Fri, 26 Apr 2019 at 21:41, Marek Olšák  wrote:
>
> From: Marek Olšák 
>
> This depends on "ac,ac/nir: use a better sync scope for shared atomics"
> ---
>  src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 12 
>  1 file changed, 4 insertions(+), 8 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c 
> b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
> index eb90bfb10ff..5e540fc5098 100644
> --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
> +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
> @@ -776,38 +776,36 @@ static void store_emit(
> emit_data->output[emit_data->chan] =
> ac_build_image_opcode(&ctx->ac, &args);
> }
>  }
>
>  static void atomic_emit_memory(struct si_shader_context *ctx,
> struct lp_build_emit_data *emit_data) {
> LLVMBuilderRef builder = ctx->ac.builder;
> const struct tgsi_full_instruction * inst = emit_data->inst;
> LLVMValueRef ptr, result, arg;
> +   const char *sync_scope = HAVE_LLVM >= 0x0900 ? "workgroup-one-as" : 
> "workgroup";
>
> ptr = get_memory_ptr(ctx, inst, ctx->i32, 1);
>
> arg = lp_build_emit_fetch(&ctx->bld_base, inst, 2, 0);
> arg = ac_to_integer(&ctx->ac, arg);
>
> if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
> LLVMValueRef new_data;
> new_data = lp_build_emit_fetch(&ctx->bld_base,
>inst, 3, 0);
>
> new_data = ac_to_integer(&ctx->ac, new_data);
>
> -   result = LLVMBuildAtomicCmpXchg(builder, ptr, arg, new_data,
> -  
> LLVMAtomicOrderingSequentiallyConsistent,
> -  
> LLVMAtomicOrderingSequentiallyConsistent,
> -  false);
> -
> +   result = ac_build_atomic_cmp_xchg(&ctx->ac, ptr, arg, 
> new_data,
> + sync_scope);
> result = LLVMBuildExtractValue(builder, result, 0, "");
> } else {
> LLVMAtomicRMWBinOp op;
>
> switch(inst->Instruction.Opcode) {
> case TGSI_OPCODE_ATOMUADD:
> op = LLVMAtomicRMWBinOpAdd;
> break;
> case TGSI_OPCODE_ATOMXCHG:
> op = LLVMAtomicRMWBinOpXchg;
> @@ -830,23 +828,21 @@ static void atomic_emit_memory(struct si_shader_context 
> *ctx,
> case TGSI_OPCODE_ATOMIMIN:
> op = LLVMAtomicRMWBinOpMin;
> break;
> case TGSI_OPCODE_ATOMIMAX:
> op = LLVMAtomicRMWBinOpMax;
> break;
> default:
> unreachable("unknown atomic opcode");
> }
>
> -   result = LLVMBuildAtomicRMW(builder, op, ptr, arg,
> -  
> LLVMAtomicOrderingSequentiallyConsistent,
> -  false);
> +   result = ac_build_atomic_rmw(&ctx->ac, op, ptr, arg, 
> sync_scope);
> }
> emit_data->output[emit_data->chan] =
> LLVMBuildBitCast(builder, result, ctx->f32, "");
>  }
>
>  static void atomic_emit(
> const struct lp_build_tgsi_action *action,
> struct lp_build_tgsi_context *bld_base,
> struct lp_build_emit_data *emit_data)
>  {
> --
> 2.17.1
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3] radv: fix set_output_usage_mask() with composite and 64-bit types

2019-04-27 Thread Rhys Perry

It previously used var->type instead of deref_instr->type and didn't
handle 64-bit outputs.

This fixes lots of CTS tests involving transform feedback and geometry
shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*).

v2: fix writemask widening when comp != 0
v3: fix 64-bit variables when comp != 0, again

Signed-off-by: Rhys Perry 
Cc: 19.0 
---
 src/amd/vulkan/radv_shader_info.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_shader_info.c 
b/src/amd/vulkan/radv_shader_info.c
index 932a1852266..e771ad79878 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
}
 }
 
+static uint32_t
+widen_writemask(uint32_t wrmask)
+{
+   uint32_t new_wrmask = 0;
+   for(unsigned i = 0; i < 4; i++)
+   new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2);
+   return new_wrmask;
+}
+
 static void
 set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
  uint8_t *output_usage_mask)
@@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
nir_deref_instr *deref_instr =
nir_instr_as_deref(instr->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref_instr);
-   unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+   unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, 
false);
unsigned idx = var->data.location;
unsigned comp = var->data.location_frac;
unsigned const_offset = 0;
@@ -127,15 +136,19 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
get_deref_offset(deref_instr, &const_offset);
 
if (var->data.compact) {
+   assert(!glsl_type_is_64bit(deref_instr->type));
const_offset += comp;
output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset 
% 4);
return;
}
 
-   for (unsigned i = 0; i < attrib_count; i++) {
+   uint32_t wrmask = nir_intrinsic_write_mask(instr);
+   if (glsl_type_is_64bit(deref_instr->type))
+   wrmask = widen_writemask(wrmask);
+
+   for (unsigned i = 0; i < attrib_count; i++)
output_usage_mask[idx + i + const_offset] |=
-   instr->const_index[0] << comp;
-   }
+   ((wrmask >> (i * 4)) & 0xf) << comp;
 }
 
 static void
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2] radv: fix set_output_usage_mask() with composite and 64-bit types

2019-04-27 Thread Rhys Perry

It previously used var->type instead of deref_instr->type and didn't
handle 64-bit outputs.

This fixes lots of CTS tests involving transform feedback and geometry
shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*).

v2: fix writemask widening when comp != 0

Signed-off-by: Rhys Perry 
Cc: 19.0 
---
 src/amd/vulkan/radv_shader_info.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_shader_info.c 
b/src/amd/vulkan/radv_shader_info.c
index 932a1852266..63ee25ab7c9 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
}
 }
 
+static uint32_t
+widen_writemask(uint32_t wrmask)
+{
+   uint32_t new_wrmask = 0;
+   for(unsigned i = 0; i < 4; i++)
+   new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2);
+   return new_wrmask;
+}
+
 static void
 set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
  uint8_t *output_usage_mask)
@@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
nir_deref_instr *deref_instr =
nir_instr_as_deref(instr->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref_instr);
-   unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+   unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, 
false);
unsigned idx = var->data.location;
unsigned comp = var->data.location_frac;
unsigned const_offset = 0;
@@ -127,15 +136,21 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
get_deref_offset(deref_instr, &const_offset);
 
if (var->data.compact) {
+   assert(!glsl_type_is_64bit(deref_instr->type));
const_offset += comp;
output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset 
% 4);
return;
}
 
-   for (unsigned i = 0; i < attrib_count; i++) {
+   uint32_t wrmask = nir_intrinsic_write_mask(instr);
+   if (glsl_type_is_64bit(deref_instr->type))
+   wrmask = widen_writemask(wrmask) << (comp * 2);
+   else
+   wrmask = wrmask << comp;
+
+   for (unsigned i = 0; i < attrib_count; i++)
output_usage_mask[idx + i + const_offset] |=
-   instr->const_index[0] << comp;
-   }
+   (wrmask >> (i * 4)) & 0xf;
 }
 
 static void
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: fix set_output_usage_mask() with composite and 64-bit types

2019-04-27 Thread Rhys Perry

It previously used var->type instead of deref_instr->type and didn't
handle 64-bit outputs.

This fixes lots of CTS tests involving transform feedback and geometry
shaders (mostly dEQP-VK.transform_feedback.fuzz.random_geometry.*).

Signed-off-by: Rhys Perry 
Cc: 19.0 
---
 src/amd/vulkan/radv_shader_info.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_shader_info.c 
b/src/amd/vulkan/radv_shader_info.c
index 932a1852266..a3bfc81808e 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -112,6 +112,15 @@ gather_intrinsic_load_deref_info(const nir_shader *nir,
}
 }
 
+static uint32_t
+widen_writemask(uint32_t wrmask)
+{
+   uint32_t new_wrmask = 0;
+   for(unsigned i = 0; i < 4; i++)
+   new_wrmask |= (wrmask & (1 << i) ? 0x3 : 0x0) << (i * 2);
+   return new_wrmask;
+}
+
 static void
 set_output_usage_mask(const nir_shader *nir, const nir_intrinsic_instr *instr,
  uint8_t *output_usage_mask)
@@ -119,7 +128,7 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
nir_deref_instr *deref_instr =
nir_instr_as_deref(instr->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref_instr);
-   unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+   unsigned attrib_count = glsl_count_attribute_slots(deref_instr->type, 
false);
unsigned idx = var->data.location;
unsigned comp = var->data.location_frac;
unsigned const_offset = 0;
@@ -127,15 +136,19 @@ set_output_usage_mask(const nir_shader *nir, const 
nir_intrinsic_instr *instr,
get_deref_offset(deref_instr, &const_offset);
 
if (var->data.compact) {
+   assert(!glsl_type_is_64bit(deref_instr->type));
const_offset += comp;
output_usage_mask[idx + const_offset / 4] |= 1 << (const_offset 
% 4);
return;
}
 
-   for (unsigned i = 0; i < attrib_count; i++) {
+   uint32_t wrmask = nir_intrinsic_write_mask(instr) << comp;
+   if (glsl_type_is_64bit(deref_instr->type))
+   wrmask = widen_writemask(wrmask);
+
+   for (unsigned i = 0; i < attrib_count; i++)
output_usage_mask[idx + i + const_offset] |=
-   instr->const_index[0] << comp;
-   }
+   (wrmask >> (i * 4)) & 0xf;
 }
 
 static void
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics

2019-04-26 Thread Rhys Perry

https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed
the meaning of the "system" sync scope, making it no longer restricted to
the memory operation's address space. So a single address space sync scope
is needed for shared atomic operations (such as "system-one-as" or
"workgroup-one-as"); otherwise, buffer_wbinvl1 and s_waitcnt instructions
can be created at each shared atomic operation.

This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg
to allow for more sync scopes and uses the new functions in ac->nir with
the "workgroup-one-as" or "workgroup" sync scopes.

  F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%)
 Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%)
RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%)
RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%)
  RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%)
 Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%)
 Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%)

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.h| 10 +-
 src/amd/common/ac_llvm_helper.cpp | 59 +++
 src/amd/common/ac_nir_to_llvm.c   | 12 +++
 3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index f4cee667153..98f856106d6 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -26,7 +26,7 @@
 #define AC_LLVM_BUILD_H
 
 #include 
-#include 
+#include 
 #include "compiler/nir/nir.h"
 #include "amd_family.h"
 
@@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, 
LLVMValueRef interp_ij);
 LLVMValueRef
 ac_build_load_helper_invocation(struct ac_llvm_context *ctx);
 
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, 
LLVMAtomicRMWBinOp op,
+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope);
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+ LLVMValueRef cmp, LLVMValueRef val,
+ const char *sync_scope);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_llvm_helper.cpp 
b/src/amd/common/ac_llvm_helper.cpp
index dcfb8008546..e5030c6f472 100644
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -31,6 +31,7 @@
 
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
+#include "ac_llvm_build.h"
 
 #include 
 #include 
@@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm)
 {
   reinterpret_cast<llvm::TargetMachine*>(tm)->setGlobalISel(true);
 }
+
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, 
LLVMAtomicRMWBinOp op,
+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope) {
+   llvm::AtomicRMWInst::BinOp binop;
+   switch (op) {
+   case LLVMAtomicRMWBinOpXchg:
+   binop = llvm::AtomicRMWInst::Xchg;
+   break;
+   case LLVMAtomicRMWBinOpAdd:
+   binop = llvm::AtomicRMWInst::Add;
+   break;
+   case LLVMAtomicRMWBinOpSub:
+   binop = llvm::AtomicRMWInst::Sub;
+   break;
+   case LLVMAtomicRMWBinOpAnd:
+   binop = llvm::AtomicRMWInst::And;
+   break;
+   case LLVMAtomicRMWBinOpNand:
+   binop = llvm::AtomicRMWInst::Nand;
+   break;
+   case LLVMAtomicRMWBinOpOr:
+   binop = llvm::AtomicRMWInst::Or;
+   break;
+   case LLVMAtomicRMWBinOpXor:
+   binop = llvm::AtomicRMWInst::Xor;
+   break;
+   case LLVMAtomicRMWBinOpMax:
+   binop = llvm::AtomicRMWInst::Max;
+   break;
+   case LLVMAtomicRMWBinOpMin:
+   binop = llvm::AtomicRMWInst::Min;
+   break;
+   case LLVMAtomicRMWBinOpUMax:
+   binop = llvm::AtomicRMWInst::UMax;
+   break;
+   case LLVMAtomicRMWBinOpUMin:
+   binop = llvm::AtomicRMWInst::UMin;
+   break;
+   default:
+   unreachable(!"invalid LLVMAtomicRMWBinOp");
+  break;
+   }
+   unsigned SSID = 
llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
+   return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW(
+   binop, llvm::unwrap(ptr), llvm::unwrap(val),
+   llvm::AtomicOrdering::SequentiallyConsistent, SSID));
+}
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+

[Mesa-dev] [PATCH] nir,ac/nir: fix cube_face_coord

2019-04-12 Thread Rhys Perry

It seems the "/ ma + 0.5" step was missing and the order of the sc/tc
results was swapped.

Fixes: a1a2a8dfda7b9cac7e ('nir: add AMD_gcn_shader extended instructions')
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 11 +--
 src/compiler/nir/nir_opcodes.py | 21 +++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0c8891d26a0..12c4c21a8d9 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1081,10 +1081,17 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
LLVMValueRef in[3];
for (unsigned chan = 0; chan < 3; chan++)
in[chan] = ac_llvm_extract_elem(&ctx->ac, src[0], chan);
-   results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
+   results[0] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
ctx->ac.f32, in, 3, 
AC_FUNC_ATTR_READNONE);
-   results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubesc",
+   results[1] = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.cubetc",
ctx->ac.f32, in, 3, 
AC_FUNC_ATTR_READNONE);
+   LLVMValueRef ma = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.cubema",
+ctx->ac.f32, in, 3, 
AC_FUNC_ATTR_READNONE);
+   results[0] = ac_build_fdiv(&ctx->ac, results[0], ma);
+   results[1] = ac_build_fdiv(&ctx->ac, results[1], ma);
+   LLVMValueRef offset = LLVMConstReal(ctx->ac.f32, 0.5);
+   results[0] = LLVMBuildFAdd(ctx->ac.builder, results[0], offset, 
"");
+   results[1] = LLVMBuildFAdd(ctx->ac.builder, results[1], offset, 
"");
result = ac_build_gather_values(&ctx->ac, results, 2);
break;
}
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 90f7aed0c0d..0f56dd9596c 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -410,12 +410,21 @@ dst.x = dst.y = 0.0;
 float absX = fabs(src0.x);
 float absY = fabs(src0.y);
 float absZ = fabs(src0.z);
-if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = 
-src0.z; }
-if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = 
src0.z; }
-if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = 
src0.x; }
-if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = 
src0.x; }
-if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = 
src0.x; }
-if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = 
-src0.x; }
+
+float ma = 0.0;
+if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
+if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
+if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
+
+if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = 
-src0.y; }
+if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = 
-src0.y; }
+if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = 
src0.z; }
+if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = 
-src0.z; }
+if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = 
-src0.y; }
+if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = 
-src0.y; }
+
+dst.x = dst.x / ma + 0.5;
+dst.y = dst.y / ma + 0.5;
 """)
 
 unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-18 Thread Rhys Perry

The CTS is buggy because the input_output_float_64_to_16 tests are run
even though they shouldn't be run because they try to use an
unadvertised (and unimplemented) optional feature.
Some of them crash for unrelated reasons though: load_tess_varyings()
from ac_nir_to_llvm.c doesn't handle 64-bit varyings. So not all of
them would work even if VK_FORMAT_R64_SFLOAT was an implemented vertex
format.

On Mon, 18 Feb 2019 at 08:53, Samuel Pitoiset  wrote:
>
>
> On 2/16/19 1:21 AM, Rhys Perry wrote:
> > This series add support for:
> > - VK_KHR_shader_float16_int8
> > - VK_AMD_gpu_shader_half_float
> > - VK_AMD_gpu_shader_int16
> > - VK_KHR_8bit_storage
> > on VI+. Half floats are disabled on LLVM 7 because of a bug causing large
> > memory usage and long (or unbounded) compilation times with some CTS
> > tests.
> >
> > It is written against the following patch series:
> > - https://patchwork.freedesktop.org/series/53454/ (v4)
> > - https://patchwork.freedesktop.org/series/53660/ (v1)
> >
> > With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for
> > dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.*
> > which fails or crashes because of unrelated radv bugs with 64-bit varyings
> > and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even
> > though radv does not support it.
>
> test bug?
>
> The two NIR related patches (22 and 25) should be sent separately,
> otherwise people working on NIR might miss them.
>
> >
> > With LLVM 9, there are no reproducible piglit regressions except for
> > glsl-array-bounds-12.shader_test because of a LLVM bug when
> > SLP vectorization is enabled.
> >
> > With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for those with LLVM 9 and a couple of tests because of a
> > LLVM bug after the SLP vectorizer and with the current lack of fallback
> > for 16-bit interpolation on LLVM versions before LLVM 9.
> >
> > With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega
> > and VI except for those with LLVM 9 and a couple of tests because of a
> > LLVM bug after the SLP vectorizer.
> >
> > The SLP vectorization patch is marked as WIP because it exposes LLVM bugs
> > with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and
> > some shader-db test for a game I can't remember. It also over-vectorizes
> > 32-bit code which can cause significant worsening in generated code
> > quality.
> >
> > The 16-bit interpolation patch is marked as WIP because it currently
> > requires intrinsics only available in LLVM 9 and does not have a fallback.
> >
> > A branch on Github containing this series can be found at:
> > https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2
> >
> > v2: rebase
> > v2: implement 16-bit interpolation
> > v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
> > v2: run vectorization unconditionally on GFX9 and later
> > v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
> > v2: remove ac_int_of_size()
> > v2: fix 64-bit visit_load_var()
> > v2: mark VK_KHR_8bit_storage as DONE in features.txt
> > v2: mark SLP vectorization patch as WIP
> > v2: fix C++ style comment
> >
> > Rhys Perry (41):
> >radv: bitcast 16-bit outputs to integers
> >radv: ensure export arguments are always float
> >ac: add various helpers for float16/int16/int8
> >ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >ac/nir: implement 8-bit ssbo stores
> >ac/nir: fix 16-bit ssbo stores
> >ac/nir: implement 8-bit nir_load_const_instr
> >ac/nir: implement 8-bit conversions
> >ac/nir: fix 64-bit nir_op_f2f16_rtz
> >ac/nir: make ac_build_clamp work on all bit sizes
> >ac/nir: make ac_build_fract work on all bit sizes
> >ac/nir: make ac_build_isign work on all bit sizes
> >ac/nir: make ac_build_fsign work on all bit sizes
> >ac/nir: make ac_build_fdiv support 16-bit floats
> >ac/nir: implement half-float nir_op_frcp
> >ac/nir: implement half-float nir_op_frsq
> >ac/nir: implement half-float nir_op_ldexp
> >radv: lower 16-bit flrp
> >ac/nir: support half floats in emit_b2f
> >ac/nir: make emit_b2i work on all bit sizes
> >ac/nir: implement 16-bit shifts
> >compiler/nir: add lowering option for 16-bit ffma
> >ac/nir: implement 16-bit ac_build_ddxy
> >ac/nir: implement 8 and 16 bit ac

Re: [Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores

2019-02-18 Thread Rhys Perry

I don't see a 16-bit version of tbuffer.store in IntrinsicsAMDGPU.td
and simply changing "llvm.amdgcn.tbuffer.store.i32" to
"llvm.amdgcn.tbuffer.store.i16" and removing the zext doesn't seem to
work.

On Mon, 18 Feb 2019 at 08:55, Samuel Pitoiset  wrote:
>
> Does this fix anything known? There is a 16-bit version of tbuffer.store,
> maybe we should use it?
>
> On 2/16/19 1:21 AM, Rhys Perry wrote:
> > Signed-off-by: Rhys Perry 
> > ---
> >   src/amd/common/ac_nir_to_llvm.c | 2 ++
> >   1 file changed, 2 insertions(+)
> >
> > diff --git a/src/amd/common/ac_nir_to_llvm.c 
> > b/src/amd/common/ac_nir_to_llvm.c
> > index 89a78b43c6f..b260142c177 100644
> > --- a/src/amd/common/ac_nir_to_llvm.c
> > +++ b/src/amd/common/ac_nir_to_llvm.c
> > @@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context 
> > *ctx,
> >   } else if (num_bytes == 2) {
> >   store_name = "llvm.amdgcn.tbuffer.store.i32";
> >   data_type = ctx->ac.i32;
> > + data = LLVMBuildBitCast(ctx->ac.builder, data, 
> > ctx->ac.i16, "");
> > + data = LLVMBuildZExt(ctx->ac.builder, data, 
> > data_type, "");
> >   LLVMValueRef tbuffer_params[] = {
> >   data,
> >   rsrc,
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 31/41] ac/nir: implement 16-bit pack/unpack opcodes

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bad1c2a990e..f6ad1aa7e77 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1015,6 +1015,30 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
}
 
+   case nir_op_pack_32_2x16_split: {
+   LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
+   result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, 
"");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_x: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_0, "");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_y: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_1, "");
+   break;
+   }
+
case nir_op_cube_face_coord: {
src[0] = ac_to_float(&ctx->ac, src[0]);
LLVMValueRef results[2];
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 32/41] ac/nir: add 8-bit types to glsl_base_to_llvm_type

2019-02-15 Thread Rhys Perry

v2: remove 16-bit additions and rebase

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f6ad1aa7e77..defbfdf4297 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3969,6 +3969,9 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac,
case GLSL_TYPE_BOOL:
case GLSL_TYPE_SUBROUTINE:
return ac->i32;
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   return ac->i8;
case GLSL_TYPE_INT16:
case GLSL_TYPE_UINT16:
return ac->i16;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 25/41] nir: make bitfield_reverse and ifind_msb work with all integers

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/compiler/nir/nir_opcodes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index dc4cd9ac63d..0f40bd6c548 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -350,7 +350,7 @@ unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, 
"src0 >> 32")
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tuint32, """
+unop("bitfield_reverse", tuint, """
 /* we're not winning any awards for speed here, but that's ok */
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++)
@@ -374,7 +374,7 @@ for (int bit = bit_size - 1; bit >= 0; bit--) {
 }
 """)
 
-unop("ifind_msb", tint32, """
+unop_convert("ifind_msb", tint32, tint, """
 dst = -1;
 for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 33/41] ac/nir, radv: create an array of varying output types

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c   | 68 +++
 src/amd/common/ac_shader_abi.h|  1 +
 src/amd/vulkan/radv_nir_to_llvm.c |  3 ++
 3 files changed, 72 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index defbfdf4297..5821c18aeb1 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -4238,6 +4238,68 @@ static void visit_cf_list(struct ac_nir_context *ctx,
}
 }
 
+static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool 
vs_in,
+struct nir_variable *var, unsigned 
cur_offset,
+const struct glsl_type *cur_type,
+void (*cb)(struct ac_llvm_context 
*, unsigned, enum glsl_base_type, void *),
+void *cbdata)
+{
+   if (glsl_type_is_struct(cur_type)) {
+   for (unsigned i = 0; i < glsl_get_length(cur_type); i++) {
+   const struct glsl_type *ft = 
glsl_get_struct_field(cur_type, i);
+   cur_offset = traverse_var_component_slots(ctx, vs_in, 
var, cur_offset, ft, cb, cbdata);
+   }
+   return (cur_offset + 3) / 4 * 4;
+   }
+
+   enum glsl_base_type base_type = 
glsl_get_base_type(glsl_without_array_or_matrix(cur_type));
+
+   unsigned stride = 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type));
+   if (!var->data.compact)
+   stride = (stride + 3) / 4 * 4;
+   unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1);
+   if (glsl_type_is_array(cur_type))
+   arr_len *= glsl_get_aoa_size(cur_type);
+   for (unsigned i = 0; i < arr_len; i++) {
+   for (unsigned j = 0; j < 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) {
+   cb(ctx, cur_offset + var->data.location_frac + j, 
base_type, cbdata);
+   }
+   cur_offset += stride;
+   }
+   return cur_offset;
+}
+
+static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, 
enum glsl_base_type base, void *output_types)
+{
+   LLVMTypeRef type;
+   switch (base) {
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   type = ctx->i8;
+   break;
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_UINT16:
+   type = ctx->i16;
+   break;
+   case GLSL_TYPE_FLOAT16:
+   type = ctx->f16;
+   break;
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_INT64:
+   case GLSL_TYPE_UINT64:
+   type = ctx->i32;
+   break;
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   default:
+   type = ctx->f32;
+   break;
+   }
+   ((LLVMTypeRef*)output_types)[index] = type;
+}
+
 void
 ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
 struct ac_shader_abi *abi,
@@ -4275,6 +4337,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
   ac_build_alloca_undef(ctx, type, "");
}
}
+
+   traverse_var_component_slots(ctx, false, variable, output_loc * 4,
+variable->type, &setup_output_type, 
abi->output_types);
 }
 
 static void
@@ -4328,6 +4393,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct 
ac_shader_abi *abi,
 
ctx.main_function = 
LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
+   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++)
+   ctx.abi->output_types[i] = ac->i32;
+
nir_foreach_variable(variable, &nir->outputs)
ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
 ctx.stage);
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index ee18e6c1923..274deeb13a4 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -69,6 +69,7 @@ struct ac_shader_abi {
LLVMValueRef view_index;
 
LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
+   LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4];
 
/* For VS and PS: pre-loaded shader inputs.
 *
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index d3795eec403..8fdaee72036 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3910,6 +3910,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler 
*ac_llvm,
ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
ac_setup_rings(&ctx);
 
+   for (unsigned i = 0; i < AC

[Mesa-dev] [PATCH v2 35/41] radv: store all fragment shader inputs as f32

2019-02-15 Thread Rhys Perry

v2: rebase

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 2002a744545..01b8b097ea1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2056,7 +2056,6 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
LLVMValueRef attr_number;
unsigned chan;
LLVMValueRef i, j;
-   bool interp = !LLVMIsUndef(interp_param);
 
attr_number = LLVMConstInt(ctx->ac.i32, attr, false);
 
@@ -2070,7 +2069,7 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
 * fs.interp cannot be used on integers, because they can be equal
 * to NaN.
 */
-   if (interp) {
+   if (interp_param) {
interp_param = LLVMBuildBitCast(ctx->ac.builder, interp_param,
ctx->ac.v2f32, "");
 
@@ -2083,7 +2082,7 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
for (chan = 0; chan < 4; chan++) {
LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
 
-   if (interp) {
+   if (interp_param) {
result[chan] = ac_build_fs_interp(&ctx->ac,
  llvm_chan,
  attr_number,
@@ -2095,7 +2094,6 @@ static void interp_fs_input(struct radv_shader_context 
*ctx,
  attr_number,
  prim_mask);
result[chan] = LLVMBuildBitCast(ctx->ac.builder, 
result[chan], ctx->ac.i32, "");
-   result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, 
result[chan], LLVMTypeOf(interp_param), "");
}
}
 }
@@ -2123,10 +2121,6 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
 
interp = lookup_interp_param(&ctx->abi, 
variable->data.interpolation, interp_type);
}
-   bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
-   LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
-   if (interp == NULL)
-   interp = LLVMGetUndef(type);
 
for (unsigned i = 0; i < attrib_count; ++i)
ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp;
@@ -2187,7 +2181,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
if (ctx->shader_info->info.ps.uses_input_attachments ||
ctx->shader_info->info.needs_multiview_view_index) {
ctx->input_mask |= 1ull << VARYING_SLOT_LAYER;
-   ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = 
LLVMGetUndef(ctx->ac.i32);
+   ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = 
NULL;
}
 
for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
@@ -2203,7 +2197,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
interp_fs_input(ctx, index, interp_param, 
ctx->abi.prim_mask,
inputs);
 
-   if (LLVMIsUndef(interp_param))
+   if (!interp_param)
ctx->shader_info->fs.flat_shaded_mask |= 1u << 
index;
if (i >= VARYING_SLOT_VAR0)
ctx->abi.fs_input_attr_indices[i - 
VARYING_SLOT_VAR0] = index;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry

This patch can be ignored. I forgot to delete it and it ended up getting sent.
"[PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation" is
the correct one.

On Sat, 16 Feb 2019 at 00:23, Rhys Perry  wrote:
>
> v2: add to patch series
>
> Signed-off-by: Rhys Perry 
> ---
>  src/amd/common/ac_llvm_build.c   | 33 +---
>  src/amd/common/ac_llvm_build.h   |  3 ++-
>  src/amd/common/ac_nir_to_llvm.c  | 14 +++---
>  src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
>  src/amd/vulkan/radv_pipeline.c   | 19 --
>  src/amd/vulkan/radv_shader.h |  1 +
>  src/gallium/drivers/radeonsi/si_shader.c |  2 +-
>  7 files changed, 69 insertions(+), 30 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index dff369aae7f..be2c2251a21 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
>LLVMValueRef attr_number,
>LLVMValueRef params,
>LLVMValueRef i,
> -  LLVMValueRef j)
> +  LLVMValueRef j,
> +  int word)
>  {
> -   LLVMValueRef args[5];
> +   LLVMValueRef args[6];
> LLVMValueRef p1;
>
> args[0] = i;
> args[1] = llvm_chan;
> args[2] = attr_number;
> -   args[3] = params;
> -
> -   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
> -   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
> +   if (word >= 0) {
> +   args[3] = LLVMConstInt(ctx->i1, word, false);
> +   args[4] = params;
> +   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
> +   ctx->f16, args, 5, 
> AC_FUNC_ATTR_READNONE);
> +   } else {
> +   args[3] = params;
> +   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
> +   ctx->f32, args, 4, 
> AC_FUNC_ATTR_READNONE);
> +   }
>
> args[0] = p1;
> args[1] = j;
> args[2] = llvm_chan;
> args[3] = attr_number;
> -   args[4] = params;
> -
> -   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
> - ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
> +   if (word >= 0) {
> +   args[4] = LLVMConstInt(ctx->i1, word, false);
> +   args[5] = params;
> +   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
> + ctx->f16, args, 6, 
> AC_FUNC_ATTR_READNONE);
> +   } else {
> +   args[4] = params;
> +   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
> + ctx->f32, args, 5, 
> AC_FUNC_ATTR_READNONE);
> +   }
>  }
>
>  LLVMValueRef
> diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
> index 61c9b5e4b6c..655427567c4 100644
> --- a/src/amd/common/ac_llvm_build.h
> +++ b/src/amd/common/ac_llvm_build.h
> @@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
>LLVMValueRef attr_number,
>LLVMValueRef params,
>LLVMValueRef i,
> -  LLVMValueRef j);
> +  LLVMValueRef j,
> +  int word);
>
>  LLVMValueRef
>  ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index bf7024c68e4..939b8eb13de 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
> *ctx,
> LLVMValueRef j = LLVMBuildExtractElement(
> ctx->ac.builder, interp_param, 
> ctx->ac.i32_1, "");
>
> +   /* This fp16 handling isn't technically 
> correct
> +* but should be correct for the attributes we
> +* are actually going to use. */
> +   bool fp16 = instr->dest.ssa.bit_size == 16;
> +   int word = fp16 ? 0 : -1;
> v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
> attr_number,
> -

[Mesa-dev] [PATCH v2 34/41] ac/nir: store all outputs as f32

2019-02-15 Thread Rhys Perry

v2: rebase
v2: fix 64-bit visit_load_var()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c   | 14 ++
 src/amd/vulkan/radv_nir_to_llvm.c | 22 +-
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 5821c18aeb1..bf7024c68e4 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2114,7 +2114,10 @@ static LLVMValueRef visit_load_var(struct ac_nir_context 
*ctx,
unreachable("unhandle variable mode");
}
ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
-   return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, 
&instr->dest.ssa), "");
+   if (instr->dest.ssa.bit_size == 16)
+   return ac_build_reinterpret(&ctx->ac, ret, get_def_type(ctx, 
&instr->dest.ssa));
+   else
+   return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, 
&instr->dest.ssa), "");
 }
 
 static void
@@ -2152,6 +2155,11 @@ visit_store_var(struct ac_nir_context *ctx,
 
writemask = writemask << comp;
 
+   LLVMTypeRef type = ctx->ac.f32;
+   if (LLVMGetTypeKind(LLVMTypeOf(src)) == LLVMVectorTypeKind)
+   type = LLVMVectorType(ctx->ac.f32, 
LLVMGetVectorSize(LLVMTypeOf(src)));
+   src = ac_build_reinterpret(&ctx->ac, src, type);
+
switch (deref->mode) {
case nir_var_shader_out:
 
@@ -4329,12 +4337,10 @@ ac_handle_shader_output_decl(struct ac_llvm_context 
*ctx,
}
}
 
-   bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
-   LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
for (unsigned i = 0; i < attrib_count; ++i) {
for (unsigned chan = 0; chan < 4; chan++) {
abi->outputs[ac_llvm_reg_index_soa(output_loc + i, 
chan)] =
-  ac_build_alloca_undef(ctx, type, "");
+  ac_build_alloca_undef(ctx, ctx->f32, "");
}
}
 
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 8fdaee72036..2002a744545 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2305,6 +2305,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
 
bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
if (ctx->stage == MESA_SHADER_FRAGMENT) {
+   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
unsigned index = target - V_008DFC_SQ_EXP_MRT;
unsigned col_format = (ctx->options->key.fs.col_format >> (4 * 
index)) & 0xf;
bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
@@ -2421,16 +2422,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
return;
}
 
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++) {
-   values[chan] = LLVMBuildBitCast(ctx->ac.builder, 
values[chan], ctx->ac.i16, "");
-   args->out[chan] = LLVMBuildZExt(ctx->ac.builder, 
values[chan], ctx->ac.i32, "");
-   }
-   } else
-   memcpy(&args->out[0], values, sizeof(values[0]) * 4);
-
-   for (unsigned i = 0; i < 4; ++i)
-   args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
+   for (unsigned chan = 0; chan < 4; chan++)
+   args->out[chan] = ac_build_reinterpret(&ctx->ac, values[chan], 
ctx->ac.f32);
 }
 
 static void
@@ -3137,9 +3130,12 @@ handle_fs_outputs_post(struct radv_shader_context *ctx)
if (i < FRAG_RESULT_DATA0)
continue;
 
-   for (unsigned j = 0; j < 4; j++)
-   values[j] = ac_to_float(&ctx->ac,
-   radv_load_output(ctx, i, j));
+   for (unsigned j = 0; j < 4; j++) {
+   values[j] = radv_load_output(ctx, i, j);
+   unsigned index = ac_llvm_reg_index_soa(i, 0);
+   LLVMTypeRef new_type = ctx->abi.output_types[index];
+   values[j] = ac_build_reinterpret(&ctx->ac, values[j], 
new_type);
+   }
 
bool ret = si_export_mrt_color(ctx, values,
   i - FRAG_RESULT_DATA0,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 37/41] WIP: radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry

v2: add to patch series

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c   | 33 +---
 src/amd/common/ac_llvm_build.h   |  3 ++-
 src/amd/common/ac_nir_to_llvm.c  | 14 +++---
 src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
 src/amd/vulkan/radv_pipeline.c   | 19 --
 src/amd/vulkan/radv_shader.h |  1 +
 src/gallium/drivers/radeonsi/si_shader.c |  2 +-
 7 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index dff369aae7f..be2c2251a21 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j)
+  LLVMValueRef j,
+  int word)
 {
-   LLVMValueRef args[5];
+   LLVMValueRef args[6];
LLVMValueRef p1;
 
args[0] = i;
args[1] = llvm_chan;
args[2] = attr_number;
-   args[3] = params;
-
-   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
-   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[3] = LLVMConstInt(ctx->i1, word, false);
+   args[4] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+   ctx->f16, args, 5, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[3] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
+   ctx->f32, args, 4, 
AC_FUNC_ATTR_READNONE);
+   }
 
args[0] = p1;
args[1] = j;
args[2] = llvm_chan;
args[3] = attr_number;
-   args[4] = params;
-
-   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
- ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[4] = LLVMConstInt(ctx->i1, word, false);
+   args[5] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[4] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
+ ctx->f32, args, 5, 
AC_FUNC_ATTR_READNONE);
+   }
 }
 
 LLVMValueRef
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 61c9b5e4b6c..655427567c4 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j);
+  LLVMValueRef j,
+  int word);
 
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bf7024c68e4..939b8eb13de 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
LLVMValueRef j = LLVMBuildExtractElement(
ctx->ac.builder, interp_param, 
ctx->ac.i32_1, "");
 
+   /* This fp16 handling isn't technically correct
+* but should be correct for the attributes we
+* are actually going to use. */
+   bool fp16 = instr->dest.ssa.bit_size == 16;
+   int word = fp16 ? 0 : -1;
v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
attr_number,
-  ctx->abi->prim_mask, i, 
j);
+  ctx->abi->prim_mask, i, 
j, word);
+   if (fp16)
+   v = ac_build_reinterpret(&ctx->ac, v, 
ctx->ac.f32);
} else {
v = ac_build_fs_interp_mov(&ctx->ac, 
LLVMConstInt(ctx->ac.i32, 2, false),
   llvm_chan, 
attr_number, ctx->abi->prim_mask);
@@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, 
attrib_idx, "");
 
}
-   retu

[Mesa-dev] [PATCH v2 37/41] radv, ac: implement 16-bit interpolation

2019-02-15 Thread Rhys Perry

v2: add to patch series

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c   | 33 +---
 src/amd/common/ac_llvm_build.h   |  3 ++-
 src/amd/common/ac_nir_to_llvm.c  | 14 +++---
 src/amd/vulkan/radv_nir_to_llvm.c| 27 ++-
 src/amd/vulkan/radv_pipeline.c   | 19 --
 src/amd/vulkan/radv_shader.h |  1 +
 src/gallium/drivers/radeonsi/si_shader.c |  2 +-
 7 files changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index dff369aae7f..be2c2251a21 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -937,27 +937,40 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j)
+  LLVMValueRef j,
+  int word)
 {
-   LLVMValueRef args[5];
+   LLVMValueRef args[6];
LLVMValueRef p1;
 
args[0] = i;
args[1] = llvm_chan;
args[2] = attr_number;
-   args[3] = params;
-
-   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
-   ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[3] = LLVMConstInt(ctx->i1, word, false);
+   args[4] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+   ctx->f16, args, 5, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[3] = params;
+   p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
+   ctx->f32, args, 4, 
AC_FUNC_ATTR_READNONE);
+   }
 
args[0] = p1;
args[1] = j;
args[2] = llvm_chan;
args[3] = attr_number;
-   args[4] = params;
-
-   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
- ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+   if (word >= 0) {
+   args[4] = LLVMConstInt(ctx->i1, word, false);
+   args[5] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+ ctx->f16, args, 6, 
AC_FUNC_ATTR_READNONE);
+   } else {
+   args[4] = params;
+   return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
+ ctx->f32, args, 5, 
AC_FUNC_ATTR_READNONE);
+   }
 }
 
 LLVMValueRef
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 61c9b5e4b6c..655427567c4 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -224,7 +224,8 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
   LLVMValueRef attr_number,
   LLVMValueRef params,
   LLVMValueRef i,
-  LLVMValueRef j);
+  LLVMValueRef j,
+  int word);
 
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bf7024c68e4..939b8eb13de 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3120,8 +3120,15 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
LLVMValueRef j = LLVMBuildExtractElement(
ctx->ac.builder, interp_param, 
ctx->ac.i32_1, "");
 
+   /* This fp16 handling isn't technically correct
+* but should be correct for the attributes we
+* are actually going to use. */
+   bool fp16 = instr->dest.ssa.bit_size == 16;
+   int word = fp16 ? 0 : -1;
v = ac_build_fs_interp(&ctx->ac, llvm_chan, 
attr_number,
-  ctx->abi->prim_mask, i, 
j);
+  ctx->abi->prim_mask, i, 
j, word);
+   if (fp16)
+   v = ac_build_reinterpret(&ctx->ac, v, 
ctx->ac.f32);
} else {
v = ac_build_fs_interp_mov(&ctx->ac, 
LLVMConstInt(ctx->ac.i32, 2, false),
   llvm_chan, 
attr_number, ctx->abi->prim_mask);
@@ -3134,8 +3141,9 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
result[chan] = LLVMBuildExtractElement(ctx->ac.builder, gather, 
attrib_idx, "");
 
}
-   retu

[Mesa-dev] [PATCH v2 38/41] WIP: ac, radv: run LLVM's SLP vectorizer

2019-02-15 Thread Rhys Perry

v2: rebase
v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
v2: run unconditionally on GFX9 and later
v2: mark as WIP because it can make 32-bit code much worse

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_util.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 69446863b95..8d78b5a850b 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -31,6 +31,7 @@
 #include <llvm-c/Transforms/IPO.h>
 #include <llvm-c/Transforms/Scalar.h>
 #include <llvm-c/Transforms/Utils.h>
+#include <llvm-c/Transforms/Vectorize.h>
 #include "c11/threads.h"
 #include "gallivm/lp_bld_misc.h"
 #include "util/u_math.h"
@@ -175,7 +176,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum 
radeon_family family,
 }
 
 static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef 
target_library_info,
-   bool check_ir)
+   bool check_ir, enum radeon_family 
family)
 {
LLVMPassManagerRef passmgr = LLVMCreatePassManager();
if (!passmgr)
@@ -203,6 +204,9 @@ static LLVMPassManagerRef 
ac_create_passmgr(LLVMTargetLibraryInfoRef target_libr
LLVMAddCFGSimplificationPass(passmgr);
/* This is recommended by the instruction combining pass. */
LLVMAddEarlyCSEMemSSAPass(passmgr);
+   /* vectorization is disabled on pre-GFX9 because it's not very useful 
there */
+   if (family >= CHIP_VEGA10)
+   LLVMAddSLPVectorizePass(passmgr);
LLVMAddInstructionCombiningPass(passmgr);
return passmgr;
 }
@@ -327,7 +331,7 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
goto fail;
 
compiler->passmgr = ac_create_passmgr(compiler->target_library_info,
- tm_options & AC_TM_CHECK_IR);
+ tm_options & AC_TM_CHECK_IR, 
family);
if (!compiler->passmgr)
goto fail;
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 39/41] ac/nir: generate better code for nir_op_f2f16_rtz

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 939b8eb13de..8bfc63958ca 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -889,7 +889,9 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-   result = LLVMBuildExtractElement(ctx->ac.builder, result, 
ctx->ac.i32_0, "");
+   // generates better code than an extractelement with slp 
vectorization
+   result = LLVMBuildBitCast(ctx->ac.builder, result, ctx->ac.i32, 
"");
+   result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, 
"");
break;
case nir_op_f2f16_rtne:
case nir_op_f2f16:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 41/41] radv, docs: expose float16, int16 and int8 features and extensions

2019-02-15 Thread Rhys Perry

v2: rebase
v2: mark VK_KHR_8bit_storage as DONE in features.txt

Signed-off-by: Rhys Perry 
---
 docs/features.txt |  2 +-
 src/amd/vulkan/radv_device.c  | 17 +
 src/amd/vulkan/radv_extensions.py |  4 
 src/amd/vulkan/radv_shader.c  |  3 +++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/docs/features.txt b/docs/features.txt
index 6c2b6d59377..ded753b0182 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -439,7 +439,7 @@ Vulkan 1.1 -- all DONE: anv, radv
   VK_KHR_variable_pointers  DONE (anv, radv)
 
 Khronos extensions that are not part of any Vulkan version:
-  VK_KHR_8bit_storage   DONE (anv)
+  VK_KHR_8bit_storage   DONE (anv, radv)
   VK_KHR_android_surface   not started
   VK_KHR_create_renderpass2 DONE (anv, radv)
   VK_KHR_display   DONE (anv, radv)
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 0fef92773e1..4137b778466 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -877,6 +877,23 @@ void radv_GetPhysicalDeviceFeatures2(
features->bufferDeviceAddressMultiDevice = false;
break;
}
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+   VkPhysicalDeviceFloat16Int8FeaturesKHR *features =
+   (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->shaderFloat16 = enabled && HAVE_LLVM >= 
0x0800;
+   features->shaderInt8 = enabled;
+   break;
+   }
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
+   VkPhysicalDevice8BitStorageFeaturesKHR *features =
+   (VkPhysicalDevice8BitStorageFeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->storageBuffer8BitAccess = enabled;
+   features->uniformAndStorageBuffer8BitAccess = enabled;
+   features->storagePushConstant8 = enabled;
+   break;
+   }
default:
break;
}
diff --git a/src/amd/vulkan/radv_extensions.py 
b/src/amd/vulkan/radv_extensions.py
index f218598f123..e38cfcfdcbe 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -91,6 +91,8 @@ EXTENSIONS = [
 Extension('VK_KHR_xlib_surface',  6, 
'VK_USE_PLATFORM_XLIB_KHR'),
 Extension('VK_KHR_multiview', 1, True),
 Extension('VK_KHR_display',  23, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
+Extension('VK_KHR_shader_float16_int8',   1, 
'device->rad_info.chip_class >= VI'),
+Extension('VK_KHR_8bit_storage',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_EXT_direct_mode_display',   1, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
 Extension('VK_EXT_acquire_xlib_display',  1, 
'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
 Extension('VK_EXT_buffer_device_address', 1, True),
@@ -121,6 +123,8 @@ EXTENSIONS = [
 Extension('VK_AMD_shader_core_properties',1, True),
 Extension('VK_AMD_shader_info',   1, True),
 Extension('VK_AMD_shader_trinary_minmax', 1, True),
+Extension('VK_AMD_gpu_shader_half_float', 1, 
'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'),
+Extension('VK_AMD_gpu_shader_int16',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_GOOGLE_decorate_string',1, True),
 Extension('VK_GOOGLE_hlsl_functionality1',1, True),
 ]
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index adba730ad8b..44dea8e7203 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -249,6 +249,9 @@ radv_shader_compile_to_nir(struct radv_device *device,
.transform_feedback = true,
.trinary_minmax = true,
.variable_pointers = true,
+   .float16 = true,
+   .storage_8bit = true,
+   .int8 =

[Mesa-dev] [PATCH v2 40/41] ac/nir: have nir_op_f2f16 round to zero

2019-02-15 Thread Rhys Perry

In the hope that one day LLVM will then be able to generate code with
vectorized v_cvt_pkrtz_f16_f32 instructions.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 8bfc63958ca..7a5e95506f2 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildUIToFP(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
break;
case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
src[0] = ac_to_float(&ctx->ac, src[0]);
if (LLVMTypeOf(src[0]) == ctx->ac.f64)
src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
@@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, 
"");
break;
case nir_op_f2f16_rtne:
-   case nir_op_f2f16:
case nir_op_f2f32:
case nir_op_f2f64:
src[0] = ac_to_float(&ctx->ac, src[0]);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 28/41] ac/nir: implement 8 and 16 bit ac_build_imsb

2019-02-15 Thread Rhys Perry

v2: fix C++ style comment

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index ec87a7b9343..c986f800fa4 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1531,6 +1531,10 @@ ac_build_imsb(struct ac_llvm_context *ctx,
  LLVMValueRef arg,
  LLVMTypeRef dst_type)
 {
+   /* TODO: support 64-bit integers */
+   if (LLVMTypeOf(arg) != ctx->i32)
+   arg = LLVMBuildSExt(ctx->builder, arg, ctx->i32, "");
+
LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
  dst_type, &arg, 1,
  AC_FUNC_ATTR_READNONE);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 10/41] ac/nir: make ac_build_clamp work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_zerof() and ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index b53d9c7ff8c..667f9700764 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1597,16 +1597,20 @@ ac_build_umsb(struct ac_llvm_context *ctx,
 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
   LLVMValueRef b)
 {
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.minnum.f%d", ac_get_elem_bits(ctx, 
LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
-   return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
+   return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2,
  AC_FUNC_ATTR_READNONE);
 }
 
 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
   LLVMValueRef b)
 {
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.maxnum.f%d", ac_get_elem_bits(ctx, 
LLVMTypeOf(a)));
LLVMValueRef args[2] = {a, b};
-   return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
+   return ac_build_intrinsic(ctx, intr, LLVMTypeOf(a), args, 2,
  AC_FUNC_ATTR_READNONE);
 }
 
@@ -1633,8 +1637,9 @@ LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, 
LLVMValueRef a,
 
 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-ctx->f32_1);
+   LLVMTypeRef t = LLVMTypeOf(value);
+   return ac_build_fmin(ctx, ac_build_fmax(ctx, value, LLVMConstReal(t, 
0.0)),
+LLVMConstReal(t, 1.0));
 }
 
 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 14/41] ac/nir: make ac_build_fdiv support 16-bit floats

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 23e454385d7..fb871a47400 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -661,7 +661,7 @@ ac_build_fdiv(struct ac_llvm_context *ctx,
 * If we do (num * (1 / den)), LLVM does:
 *return num * v_rcp_f32(den);
 */
-   LLVMValueRef one = LLVMTypeOf(num) == ctx->f64 ? ctx->f64_1 : 
ctx->f32_1;
+   LLVMValueRef one = LLVMConstReal(LLVMTypeOf(num), 1.0);
LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 21/41] ac/nir: implement 16-bit shifts

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 75bb19031bf..bad1c2a990e 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -672,20 +672,17 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_ishl:
result = LLVMBuildShl(ctx->ac.builder, src[0],
- LLVMBuildZExt(ctx->ac.builder, src[1],
-   LLVMTypeOf(src[0]), ""),
+ ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
  "");
break;
case nir_op_ishr:
result = LLVMBuildAShr(ctx->ac.builder, src[0],
-  LLVMBuildZExt(ctx->ac.builder, src[1],
-LLVMTypeOf(src[0]), ""),
+  ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
   "");
break;
case nir_op_ushr:
result = LLVMBuildLShr(ctx->ac.builder, src[0],
-  LLVMBuildZExt(ctx->ac.builder, src[1],
-LLVMTypeOf(src[0]), ""),
+  ac_build_ui_cast(&ctx->ac, src[1], 
LLVMTypeOf(src[0])),
   "");
break;
case nir_op_ilt32:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 11/41] ac/nir: make ac_build_fract work on all bit sizes

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 667f9700764..db937eb66fb 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2049,16 +2049,9 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, 
unsigned simm16)
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMTypeRef type;
-   char *intr;
-
-   if (bitsize == 32) {
-   intr = "llvm.floor.f32";
-   type = ctx->f32;
-   } else {
-   intr = "llvm.floor.f64";
-   type = ctx->f64;
-   }
+   LLVMTypeRef type = ac_float_of_size(ctx, bitsize);
+   char intr[64];
+   snprintf(intr, sizeof(intr), "llvm.floor.f%d", bitsize);
 
LLVMValueRef params[] = {
src0,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 16/41] ac/nir: implement half-float nir_op_frsq

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index cba0cec3e8f..8b0e07d2930 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -788,8 +788,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
case nir_op_frsq:
result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
  ac_to_float_type(&ctx->ac, 
def_type), src[0]);
-   result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size 
== 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
-  result);
+   result = ac_build_fdiv(&ctx->ac, 
LLVMConstReal(LLVMTypeOf(result), 1.0), result);
break;
case nir_op_frexp_exp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 15/41] ac/nir: implement half-float nir_op_frcp

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 741059b5f1a..cba0cec3e8f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -657,8 +657,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_frcp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-   result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size 
== 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
-  src[0]);
+   result = ac_build_fdiv(&ctx->ac, 
LLVMConstReal(LLVMTypeOf(src[0]), 1.0), src[0]);
break;
case nir_op_iand:
result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 18/41] radv: lower 16-bit flrp

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_shader.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 1dcb0606246..adba730ad8b 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -53,6 +53,7 @@
 static const struct nir_shader_compiler_options nir_options = {
.vertex_id_zero_based = true,
.lower_scmp = true,
+   .lower_flrp16 = true,
.lower_flrp32 = true,
.lower_flrp64 = true,
.lower_device_index_to_zero = true,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 17/41] ac/nir: implement half-float nir_op_ldexp

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 8b0e07d2930..0e5946dfdb3 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -829,8 +829,10 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_ldexp:
src[0] = ac_to_float(&ctx->ac, src[0]);
-   if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) == 32)
+   if (ac_get_elem_bits(&ctx->ac, def_type) == 32)
result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f32", ctx->ac.f32, src, 2, AC_FUNC_ATTR_READNONE);
+   else if (ac_get_elem_bits(&ctx->ac, def_type) == 16)
+   result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f16", ctx->ac.f16, src, 2, AC_FUNC_ATTR_READNONE);
else
result = ac_build_intrinsic(&ctx->ac, 
"llvm.amdgcn.ldexp.f64", ctx->ac.f64, src, 2, AC_FUNC_ATTR_READNONE);
break;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 26/41] ac/nir: make ac_find_lsb work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_zero() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 33 ++---
 1 file changed, 6 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index aa92c55c822..61085db9320 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2474,30 +2474,11 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
 LLVMTypeRef dst_type,
 LLVMValueRef src0)
 {
-   unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
-   const char *intrin_name;
-   LLVMTypeRef type;
-   LLVMValueRef zero;
-
-   switch (src0_bitsize) {
-   case 64:
-   intrin_name = "llvm.cttz.i64";
-   type = ctx->i64;
-   zero = ctx->i64_0;
-   break;
-   case 32:
-   intrin_name = "llvm.cttz.i32";
-   type = ctx->i32;
-   zero = ctx->i32_0;
-   break;
-   case 16:
-   intrin_name = "llvm.cttz.i16";
-   type = ctx->i16;
-   zero = ctx->i16_0;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   }
+   LLVMTypeRef type = LLVMTypeOf(src0);
+   unsigned src0_bitsize = ac_get_elem_bits(ctx, type);
+   char intrin_name[64];
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", 
src0_bitsize);
 
LLVMValueRef params[2] = {
src0,
@@ -2518,9 +2499,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
  params, 2,
  AC_FUNC_ATTR_READNONE);
 
-   if (src0_bitsize == 64) {
-   lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
-   }
+   lsb = ac_build_ui_cast(ctx, lsb, ctx->i32);
 
/* TODO: We need an intrinsic to skip this conditional. */
/* Check for zero: */
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 12/41] ac/nir: make ac_build_isign work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_zero(), ac_get_one() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 27 ---
 1 file changed, 4 insertions(+), 23 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index db937eb66fb..3b2257e8bf0 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2064,30 +2064,11 @@ LLVMValueRef ac_build_fract(struct ac_llvm_context 
*ctx, LLVMValueRef src0,
 LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMValueRef cmp, val, zero, one;
-   LLVMTypeRef type;
-
-   switch (bitsize) {
-   case 64:
-   type = ctx->i64;
-   zero = ctx->i64_0;
-   one = ctx->i64_1;
-   break;
-   case 32:
-   type = ctx->i32;
-   zero = ctx->i32_0;
-   one = ctx->i32_1;
-   break;
-   case 16:
-   type = ctx->i16;
-   zero = ctx->i16_0;
-   one = ctx->i16_1;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, bitsize);
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   LLVMValueRef one = LLVMConstInt(type, 1, false);
 
+   LLVMValueRef cmp, val;
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 19/41] ac/nir: support half floats in emit_b2f

2019-02-15 Thread Rhys Perry

This seems to generate fine code, even though the IR is a bit ugly.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0e5946dfdb3..e459001c1cf 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -316,14 +316,20 @@ static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx,
 unsigned bitsize)
 {
LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0,
-  LLVMBuildBitCast(ctx->builder, 
LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""),
+  LLVMBuildBitCast(ctx->builder, 
ctx->f32_1, ctx->i32, ""),
   "");
result = LLVMBuildBitCast(ctx->builder, result, ctx->f32, "");
 
-   if (bitsize == 32)
+   switch (bitsize) {
+   case 16:
+   return LLVMBuildFPTrunc(ctx->builder, result, ctx->f16, "");
+   case 32:
return result;
-
-   return LLVMBuildFPExt(ctx->builder, result, ctx->f64, "");
+   case 64:
+   return LLVMBuildFPExt(ctx->builder, result, ctx->f64, "");
+   default:
+   unreachable("Unsupported bit size.");
+   }
 }
 
 static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 27/41] ac/nir: make ac_build_umsb work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_zero() and ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 38 +++---
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 61085db9320..ec87a7b9343 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1555,36 +1555,12 @@ ac_build_umsb(struct ac_llvm_context *ctx,
  LLVMValueRef arg,
  LLVMTypeRef dst_type)
 {
-   const char *intrin_name;
-   LLVMTypeRef type;
-   LLVMValueRef highest_bit;
-   LLVMValueRef zero;
-   unsigned bitsize;
-
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(arg));
-   switch (bitsize) {
-   case 64:
-   intrin_name = "llvm.ctlz.i64";
-   type = ctx->i64;
-   highest_bit = LLVMConstInt(ctx->i64, 63, false);
-   zero = ctx->i64_0;
-   break;
-   case 32:
-   intrin_name = "llvm.ctlz.i32";
-   type = ctx->i32;
-   highest_bit = LLVMConstInt(ctx->i32, 31, false);
-   zero = ctx->i32_0;
-   break;
-   case 16:
-   intrin_name = "llvm.ctlz.i16";
-   type = ctx->i16;
-   highest_bit = LLVMConstInt(ctx->i16, 15, false);
-   zero = ctx->i16_0;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMTypeRef type = LLVMTypeOf(arg);
+   unsigned bitsize = ac_get_elem_bits(ctx, type);
+   LLVMValueRef highest_bit = LLVMConstInt(type, bitsize - 1, false);
+   LLVMValueRef zero = LLVMConstInt(type, 0, false);
+   char intrin_name[64];
+   snprintf(intrin_name, sizeof(intrin_name), "llvm.ctlz.i%d", bitsize);
 
LLVMValueRef params[2] = {
arg,
@@ -1598,7 +1574,7 @@ ac_build_umsb(struct ac_llvm_context *ctx,
/* The HW returns the last bit index from MSB, but TGSI/NIR wants
 * the index from LSB. Invert it by doing "31 - msb". */
msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
-   msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");
+   msb = ac_build_ui_cast(ctx, msb, dst_type);
 
/* check for zero */
return LLVMBuildSelect(ctx->builder,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 24/41] ac/nir: implement 8 and 16 bit ac_build_readlane

2019-02-15 Thread Rhys Perry

v2: don't use ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 71eaac4b7bd..aa92c55c822 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2868,9 +2868,15 @@ ac_build_readlane(struct ac_llvm_context *ctx, 
LLVMValueRef src, LLVMValueRef la
 {
LLVMTypeRef src_type = LLVMTypeOf(src);
src = ac_to_integer(ctx, src);
-   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+   unsigned src_bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+   unsigned bits = src_bits;
LLVMValueRef ret;
 
+   if (bits < 32) {
+   src = LLVMBuildZExt(ctx->builder, src, ctx->i32, "");
+   bits = 32;
+   }
+
if (bits == 32) {
ret = _ac_build_readlane(ctx, src, lane);
} else {
@@ -2887,6 +2893,10 @@ ac_build_readlane(struct ac_llvm_context *ctx, 
LLVMValueRef src, LLVMValueRef la
LLVMConstInt(ctx->i32, i, 0), 
"");
}
}
+
+   if (src_bits < 32)
+   ret = LLVMBuildTrunc(ctx->builder, ret, 
LLVMIntTypeInContext(ctx->context, src_bits), "");
+
return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
 }
 
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 30/41] ac/nir: make ac_build_bitfield_reverse work on all bit sizes

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 46738faea9d..dff369aae7f 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2100,28 +2100,14 @@ LLVMValueRef ac_build_bit_count(struct ac_llvm_context 
*ctx, LLVMValueRef src0)
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
   LLVMValueRef src0)
 {
-   LLVMValueRef result;
-   unsigned bitsize;
-
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
 
-   switch (bitsize) {
-   case 32:
-   result = ac_build_intrinsic(ctx, "llvm.bitreverse.i32", 
ctx->i32,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   case 16:
-   result = ac_build_intrinsic(ctx, "llvm.bitreverse.i16", 
ctx->i16,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   char name[64];
+   snprintf(name, sizeof(name), "llvm.bitreverse.i%d", bitsize);
 
-   return result;
+   return ac_build_intrinsic(ctx, name, LLVMTypeOf(src0),
+ (LLVMValueRef []) { src0 }, 1,
+ AC_FUNC_ATTR_READNONE);
 }
 
 #define AC_EXP_TARGET  0
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 23/41] ac/nir: implement 16-bit ac_build_ddxy

2019-02-15 Thread Rhys Perry

v2: rebase

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index fb871a47400..71eaac4b7bd 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1481,6 +1481,11 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
LLVMValueRef tl, trbl;
LLVMValueRef result;
 
+   int size = ac_get_type_size(LLVMTypeOf(val));
+
+   if (size == 2)
+   val = LLVMBuildZExt(ctx->builder, val, ctx->i32, "");
+
for (unsigned i = 0; i < 4; ++i) {
tl_lanes[i] = i & mask;
trbl_lanes[i] = (i & mask) + idx;
@@ -1493,12 +1498,19 @@ ac_build_ddxy(struct ac_llvm_context *ctx,
 trbl_lanes[0], trbl_lanes[1],
 trbl_lanes[2], trbl_lanes[3]);
 
-   tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
-   trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
+   if (size == 2) {
+   tl = LLVMBuildTrunc(ctx->builder, tl, ctx->i16, "");
+   trbl = LLVMBuildTrunc(ctx->builder, trbl, ctx->i16, "");
+   }
+
+   LLVMTypeRef type = ac_float_of_size(ctx, size * 8);
+   tl = LLVMBuildBitCast(ctx->builder, tl, type, "");
+   trbl = LLVMBuildBitCast(ctx->builder, trbl, type, "");
result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
 
-   result = ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.f32", ctx->f32,
-   &result, 1, 0);
+   result = ac_build_intrinsic(ctx,
+   LLVMTypeOf(val) == ctx->f32 ? "llvm.amdgcn.wqm.f32" : 
"llvm.amdgcn.wqm.f16", type,
+   &result, 1, 0);
 
return result;
 }
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 29/41] ac/nir: make ac_build_bit_count work on all bit sizes

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 33 +++--
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index c986f800fa4..46738faea9d 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2085,35 +2085,16 @@ LLVMValueRef ac_build_fsign(struct ac_llvm_context 
*ctx, LLVMValueRef src0,
 
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0)
 {
-   LLVMValueRef result;
-   unsigned bitsize;
+   unsigned bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
 
-   bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
+   char name[64];
+   snprintf(name, sizeof(name), "llvm.ctpop.i%d", bitsize);
 
-   switch (bitsize) {
-   case 64:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i64", ctx->i64,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-
-   result = LLVMBuildTrunc(ctx->builder, result, ctx->i32, "");
-   break;
-   case 32:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i32", ctx->i32,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   case 16:
-   result = ac_build_intrinsic(ctx, "llvm.ctpop.i16", ctx->i16,
-   (LLVMValueRef []) { src0 }, 1,
-   AC_FUNC_ATTR_READNONE);
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   break;
-   }
+   LLVMValueRef result = ac_build_intrinsic(ctx, name, LLVMTypeOf(src0),
+(LLVMValueRef []) { src0 }, 1,
+AC_FUNC_ATTR_READNONE);
 
-   return result;
+   return ac_build_ui_cast(ctx, result, ctx->i32);
 }
 
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 36/41] radv: handle all fragment output types

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 55 ---
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 01b8b097ea1..c46eabf3656 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2297,9 +2297,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (!values)
return;
 
-   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
if (ctx->stage == MESA_SHADER_FRAGMENT) {
-   bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
unsigned index = target - V_008DFC_SQ_EXP_MRT;
unsigned col_format = (ctx->options->key.fs.col_format >> (4 * 
index)) & 0xf;
bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
@@ -2310,6 +2308,28 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
LLVMValueRef (*packi)(struct ac_llvm_context *ctx, LLVMValueRef 
args[2],
  unsigned bits, bool hi) = NULL;
 
+   if (LLVMTypeOf(values[0]) == ctx->ac.f16 &&
+   col_format != V_028714_SPI_SHADER_FP16_ABGR) {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = LLVMBuildFPExt(ctx->ac.builder,
+ values[chan],
+ ctx->ac.f32, "");
+   }
+
+   if (LLVMTypeOf(values[0]) == ctx->ac.i16 || 
LLVMTypeOf(values[0]) == ctx->ac.i8) {
+   if (col_format == V_028714_SPI_SHADER_SINT16_ABGR) {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = 
LLVMBuildSExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   } else {
+   for (unsigned chan = 0; chan < 4; chan++)
+   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   }
+   }
+
switch(col_format) {
case V_028714_SPI_SHADER_ZERO:
args->enabled_channels = 0; /* writemask */
@@ -2335,12 +2355,16 @@ si_llvm_init_export_args(struct radv_shader_context 
*ctx,
 
case V_028714_SPI_SHADER_FP16_ABGR:
args->enabled_channels = 0x5;
-   packf = ac_build_cvt_pkrtz_f16;
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++)
-   values[chan] = 
LLVMBuildFPExt(ctx->ac.builder,
- 
values[chan],
- 
ctx->ac.f32, "");
+   if (LLVMTypeOf(values[0]) == ctx->ac.f16) {
+   packi = ac_build_cvt_pk_u16;
+   for (unsigned chan = 0; chan < 4; chan++) {
+   values[chan] = ac_to_integer(&ctx->ac, 
values[chan]);
+   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
+
values[chan],
+
ctx->ac.i32, "");
+   }
+   } else {
+   packf = ac_build_cvt_pkrtz_f16;
}
break;
 
@@ -2357,23 +2381,11 @@ si_llvm_init_export_args(struct radv_shader_context 
*ctx,
case V_028714_SPI_SHADER_UINT16_ABGR:
args->enabled_channels = 0x5;
packi = ac_build_cvt_pk_u16;
-   if (is_16bit) {
-   for (unsigned chan = 0; chan < 4; chan++)
-   values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
- 
ac_to_integer(&ctx->ac, values[chan]),
- 
ctx->ac.i32, "");
-   }
break;
 
case V_028714_SPI_SHADER_SIN

[Mesa-dev] [PATCH v2 22/41] compiler/nir: add lowering option for 16-bit ffma

2019-02-15 Thread Rhys Perry

The lowering needs to be disabled for sufficient precision to pass
deqp-vk's 16-bit fma test on radv.

Signed-off-by: Rhys Perry 
---
 src/broadcom/compiler/nir_to_vir.c| 1 +
 src/compiler/nir/nir.h| 1 +
 src/compiler/nir/nir_opt_algebraic.py | 4 +++-
 src/gallium/drivers/radeonsi/si_get.c | 1 +
 src/gallium/drivers/vc4/vc4_program.c | 1 +
 5 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c 
b/src/broadcom/compiler/nir_to_vir.c
index d983f91e718..6c0a623096a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2471,6 +2471,7 @@ const nir_shader_compiler_options v3d_nir_options = {
 .lower_fdiv = true,
 .lower_find_lsb = true,
 .lower_ffma = true,
+.lower_ffma16 = true,
 .lower_flrp32 = true,
 .lower_fpow = true,
 .lower_fsat = true,
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 740c64d2a94..8df275f4aa3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2111,6 +2111,7 @@ typedef struct nir_function {
 
 typedef struct nir_shader_compiler_options {
bool lower_fdiv;
+   bool lower_ffma16;
bool lower_ffma;
bool fuse_ffma;
bool lower_flrp16;
diff --git a/src/compiler/nir/nir_opt_algebraic.py 
b/src/compiler/nir/nir_opt_algebraic.py
index 71c626e1b3f..63dff878d35 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -136,7 +136,9 @@ optimizations = [
(('~fadd', a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a, 
('bcsel', c, b, a), 'options->lower_flrp32'),
(('~fadd@32', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', 
a, b, c), '!options->lower_flrp32'),
(('~fadd@64', a, ('fmul', c , ('fadd', b, ('fneg', a, ('flrp', 
a, b, c), '!options->lower_flrp64'),
-   (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
+   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 
'options->lower_ffma16'),
+   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
+   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma'),
 
(('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d)),
diff --git a/src/gallium/drivers/radeonsi/si_get.c 
b/src/gallium/drivers/radeonsi/si_get.c
index f8ca02d4fcf..5bf107ef6fe 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -491,6 +491,7 @@ static const struct nir_shader_compiler_options nir_options 
= {
.lower_fdiv = true,
.lower_sub = true,
.lower_ffma = true,
+   .lower_ffma16 = true,
.lower_pack_snorm_2x16 = true,
.lower_pack_snorm_4x8 = true,
.lower_pack_unorm_2x16 = true,
diff --git a/src/gallium/drivers/vc4/vc4_program.c 
b/src/gallium/drivers/vc4/vc4_program.c
index 2d0a52bb5fb..8be258cbba4 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2234,6 +2234,7 @@ static const nir_shader_compiler_options nir_options = {
 .lower_extract_word = true,
 .lower_fdiv = true,
 .lower_ffma = true,
+.lower_ffma16 = true,
 .lower_flrp32 = true,
 .lower_fpow = true,
 .lower_fsat = true,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 20/41] ac/nir: make emit_b2i work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index e459001c1cf..75bb19031bf 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -347,11 +347,7 @@ static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx,
 unsigned bitsize)
 {
LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
-
-   if (bitsize == 32)
-   return result;
-
-   return LLVMBuildZExt(ctx->builder, result, ctx->i64, "");
+   return ac_build_ui_cast(ctx, result, LLVMIntTypeInContext(ctx->context, 
bitsize));
 }
 
 static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 05/41] ac/nir: implement 8-bit ssbo stores

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 17d952d1ae8..89a78b43c6f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1524,7 +1524,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
 
LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
get_src(ctx, instr->src[1]), true);
-   LLVMValueRef base_data = ac_to_float(&ctx->ac, src_data);
+   LLVMValueRef base_data = src_data;
base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
 
@@ -1565,7 +1565,25 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
  LLVMConstInt(ctx->ac.i32, start * 
elem_size_bytes, false), "");
}
-   if (num_bytes == 2) {
+   if (num_bytes == 1) {
+   store_name = "llvm.amdgcn.tbuffer.store.i32";
+   data_type = ctx->ac.i32;
+   data = LLVMBuildZExt(ctx->ac.builder, data, data_type, 
"");
+   LLVMValueRef tbuffer_params[] = {
+   data,
+   rsrc,
+   ctx->ac.i32_0, /* vindex */
+   offset,/* voffset */
+   ctx->ac.i32_0,
+   ctx->ac.i32_0,
+   LLVMConstInt(ctx->ac.i32, 1, false), // dfmt (= 
8bit)
+   LLVMConstInt(ctx->ac.i32, 4, false), // nfmt (= 
uint)
+   glc,
+   ctx->ac.i1false,
+   };
+   ac_build_intrinsic(&ctx->ac, store_name,
+  ctx->ac.voidt, tbuffer_params, 10, 
0);
+   } else if (num_bytes == 2) {
store_name = "llvm.amdgcn.tbuffer.store.i32";
data_type = ctx->ac.i32;
LLVMValueRef tbuffer_params[] = {
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 03/41] ac: add various helpers for float16/int16/int8

2019-02-15 Thread Rhys Perry

v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
v2: remove ac_int_of_size()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c  | 55 ++---
 src/amd/common/ac_llvm_build.h  | 15 +++--
 src/amd/common/ac_nir_to_llvm.c | 30 +-
 3 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 9395bd1bbda..b53d9c7ff8c 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
 
+   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
+   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
+   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
+   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
@@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type)
 
 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, 
LLVMTypeRef t)
 {
-   if (t == ctx->f16 || t == ctx->i16)
+   if (t == ctx->i8)
+   return ctx->i8;
+   else if (t == ctx->f16 || t == ctx->i16)
return ctx->i16;
else if (t == ctx->f32 || t == ctx->i32)
return ctx->i32;
@@ -281,6 +287,42 @@ ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), 
"");
 }
 
+LLVMTypeRef ac_float_of_size(struct ac_llvm_context *ctx, unsigned bit_size)
+{
+   switch (bit_size) {
+   case 16:
+   return ctx->f16;
+   case 32:
+   return ctx->f32;
+   case 64:
+   return ctx->f64;
+   default:
+   unreachable("Unhandled bit size");
+   }
+}
+
+LLVMValueRef ac_build_ui_cast(struct ac_llvm_context *ctx, LLVMValueRef v, 
LLVMTypeRef t)
+{
+   unsigned new_bit_size = ac_get_elem_bits(ctx, t);
+   unsigned old_bit_size = ac_get_elem_bits(ctx, LLVMTypeOf(v));
+   if (new_bit_size > old_bit_size)
+   return LLVMBuildZExt(ctx->builder, v, t, "");
+   else if (new_bit_size < old_bit_size)
+   return LLVMBuildTrunc(ctx->builder, v, t, "");
+   else
+   return v;
+}
+
+LLVMValueRef ac_build_reinterpret(struct ac_llvm_context *ctx, LLVMValueRef v, 
LLVMTypeRef t)
+{
+   if (LLVMTypeOf(v) == t)
+   return v;
+
+   v = ac_to_integer(ctx, v);
+   v = ac_build_ui_cast(ctx, v, ac_to_integer_type(ctx, t));
+   return LLVMBuildBitCast(ctx->builder, v, t, "");
+}
+
 
 LLVMValueRef
 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
@@ -1338,15 +1380,18 @@ LLVMValueRef 
ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
 }
 
 LLVMValueRef
-ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+ac_build_tbuffer_load_short_byte(struct ac_llvm_context *ctx,
LLVMValueRef rsrc,
LLVMValueRef vindex,
LLVMValueRef voffset,
LLVMValueRef soffset,
LLVMValueRef immoffset,
-   LLVMValueRef glc)
+   LLVMValueRef glc,
+   unsigned size)
 {
+   assert(size == 1 || size == 2);
const char *name = "llvm.amdgcn.tbuffer.load.i32";
+   int data_format = size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : 
V_008F0C_BUF_DATA_FORMAT_16;
LLVMTypeRef type = ctx->i32;
LLVMValueRef params[] = {
rsrc,
@@ -1354,13 +1399,13 @@ ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
voffset,
soffset,
immoffset,
-   LLVMConstInt(ctx->i32, 
V_008F0C_BUF_DATA_FORMAT_16, false),
+   LLVMConstInt(ctx->i32, data_format, false),
LLVMConstInt(ctx->i32, 
V_008F0C_BUF_NUM_FORMAT_UINT, false),
glc,
ctx->i1false,
};
LLVMValueRef res = ac_build

[Mesa-dev] [PATCH v2 09/41] ac/nir: fix 64-bit nir_op_f2f16_rtz

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 691d444db05..741059b5f1a 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -886,6 +886,8 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
case nir_op_f2f16_rtz:
src[0] = ac_to_float(&ctx->ac, src[0]);
+   if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+   src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
result = LLVMBuildExtractElement(ctx->ac.builder, result, 
ctx->ac.i32_0, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 02/41] radv: ensure export arguments are always float

2019-02-15 Thread Rhys Perry

So that the signature is correct and consistent, the inputs to an export
intrinsic should always be 32-bit floats.

This and the previous commit fix a large number of crashes in the
dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_*
tests.

Fixes: b722b29f10d ('radv: add support for 16bit input/output')
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index a8268c44ecf..d3795eec403 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2429,12 +2429,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
} else
memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 
-   for (unsigned i = 0; i < 4; ++i) {
-   if (!(args->enabled_channels & (1 << i)))
-   continue;
-
+   for (unsigned i = 0; i < 4; ++i)
args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
-   }
 }
 
 static void
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 07/41] ac/nir: implement 8-bit nir_load_const_instr

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index b260142c177..f39232b91a1 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1114,6 +1114,10 @@ static void visit_load_const(struct ac_nir_context *ctx,
 
for (unsigned i = 0; i < instr->def.num_components; ++i) {
switch (instr->def.bit_size) {
+   case 8:
+   values[i] = LLVMConstInt(element_type,
+instr->value.u8[i], false);
+   break;
case 16:
values[i] = LLVMConstInt(element_type,
 instr->value.u16[i], false);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 13/41] ac/nir: make ac_build_fsign work on all bit sizes

2019-02-15 Thread Rhys Perry

v2: don't use ac_get_zerof() and ac_get_onef()

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 16 
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 3b2257e8bf0..23e454385d7 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2079,19 +2079,11 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context 
*ctx, LLVMValueRef src0,
 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
unsigned bitsize)
 {
-   LLVMValueRef cmp, val, zero, one;
-   LLVMTypeRef type;
-
-   if (bitsize == 32) {
-   type = ctx->f32;
-   zero = ctx->f32_0;
-   one = ctx->f32_1;
-   } else {
-   type = ctx->f64;
-   zero = ctx->f64_0;
-   one = ctx->f64_1;
-   }
+   LLVMTypeRef type = ac_float_of_size(ctx, bitsize);
+   LLVMValueRef zero = LLVMConstReal(type, 0.0);
+   LLVMValueRef one = LLVMConstReal(type, 1.0);
 
+   LLVMValueRef cmp, val;
cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 08/41] ac/nir: implement 8-bit conversions

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f39232b91a1..691d444db05 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -858,12 +858,14 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
src[i] = ac_to_integer(&ctx->ac, src[i]);
result = ac_build_gather_values(&ctx->ac, src, num_components);
break;
+   case nir_op_f2i8:
case nir_op_f2i16:
case nir_op_f2i32:
case nir_op_f2i64:
src[0] = ac_to_float(&ctx->ac, src[0]);
result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
break;
+   case nir_op_f2u8:
case nir_op_f2u16:
case nir_op_f2u32:
case nir_op_f2u64:
@@ -898,15 +900,14 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
else
result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
break;
+   case nir_op_u2u8:
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_u2u64:
src[0] = ac_to_integer(&ctx->ac, src[0]);
-   if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < 
ac_get_elem_bits(&ctx->ac, def_type))
-   result = LLVMBuildZExt(ctx->ac.builder, src[0], 
def_type, "");
-   else
-   result = LLVMBuildTrunc(ctx->ac.builder, src[0], 
def_type, "");
+   result = ac_build_ui_cast(&ctx->ac, src[0], def_type);
break;
+   case nir_op_i2i8:
case nir_op_i2i16:
case nir_op_i2i32:
case nir_op_i2i64:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 04/41] ac/nir: implement 8-bit push constant, ssbo and ubo loads

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index bed52490bad..17d952d1ae8 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1399,7 +1399,30 @@ static LLVMValueRef visit_load_push_constant(struct 
ac_nir_context *ctx,
 
ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
 
-   if (instr->dest.ssa.bit_size == 16) {
+   if (instr->dest.ssa.bit_size == 8) {
+   unsigned load_dwords = instr->dest.ssa.num_components > 1 ? 2 : 
1;
+   LLVMTypeRef vec_type = 
LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 4 * load_dwords);
+   ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
+   LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+
+   LLVMValueRef params[3];
+   if (load_dwords > 1) {
+   LLVMValueRef res_vec = 
LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i32, 2), "");
+   params[0] = LLVMBuildExtractElement(ctx->ac.builder, 
res_vec, LLVMConstInt(ctx->ac.i32, 1, false), "");
+   params[1] = LLVMBuildExtractElement(ctx->ac.builder, 
res_vec, LLVMConstInt(ctx->ac.i32, 0, false), "");
+   } else {
+   res = LLVMBuildBitCast(ctx->ac.builder, res, 
ctx->ac.i32, "");
+   params[0] = ctx->ac.i32_0;
+   params[1] = res;
+   }
+   params[2] = addr;
+   res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.alignbyte", 
ctx->ac.i32, params, 3, 0);
+
+   res = LLVMBuildTrunc(ctx->ac.builder, res, 
LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.num_components * 8), "");
+   if (instr->dest.ssa.num_components > 1)
+   res = LLVMBuildBitCast(ctx->ac.builder, res, 
LLVMVectorType(LLVMInt8TypeInContext(ctx->ac.context), 
instr->dest.ssa.num_components), "");
+   return res;
+   } else if (instr->dest.ssa.bit_size == 16) {
unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1;
LLVMTypeRef vec_type = 
LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
@@ -1676,7 +1699,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * 
elem_size_bytes, false);
 
LLVMValueRef ret;
-   if (load_bytes == 2) {
+   if (load_bytes <= 2) {
ret = ac_build_tbuffer_load_short_byte(&ctx->ac,
   rsrc,
   vindex,
@@ -1684,7 +1707,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
   ctx->ac.i32_0,
   immoffset,
   glc,
-  2);
+  load_bytes);
} else {
const char *load_name;
LLVMTypeRef data_type;
@@ -1700,6 +1723,7 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
data_type = ctx->ac.v2f32;
break;
case 4:
+   case 3:
load_name = "llvm.amdgcn.buffer.load.f32";
data_type = ctx->ac.f32;
break;
@@ -1746,7 +1770,8 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
ac_nir_context *ctx,
if (instr->dest.ssa.bit_size == 64)
num_components *= 2;
 
-   if (instr->dest.ssa.bit_size == 16) {
+   if (instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 8) {
+   unsigned size = instr->dest.ssa.bit_size / 8;
LLVMValueRef results[num_components];
for (unsigned i = 0; i < num_components; ++i) {
results[i] = ac_build_tbuffer_load_short_byte(&ctx->ac,
@@ -1754,9 +1779,9 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
ac_nir_context *ctx,
  
ctx->ac.i32_0,

[Mesa-dev] [PATCH v2 06/41] ac/nir: fix 16-bit ssbo stores

2019-02-15 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 89a78b43c6f..b260142c177 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1586,6 +1586,8 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
} else if (num_bytes == 2) {
store_name = "llvm.amdgcn.tbuffer.store.i32";
data_type = ctx->ac.i32;
+   data = LLVMBuildBitCast(ctx->ac.builder, data, 
ctx->ac.i16, "");
+   data = LLVMBuildZExt(ctx->ac.builder, data, data_type, 
"");
LLVMValueRef tbuffer_params[] = {
data,
rsrc,
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v2 01/41] radv: bitcast 16-bit outputs to integers

2019-02-15 Thread Rhys Perry

16-bit outputs are stored as 16-bit floats in the outputs array, so they
have to be bitcast.

Fixes: b722b29f10d ('radv: add support for 16bit input/output')
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 7f74678d5f1..a8268c44ecf 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2365,7 +2365,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = 
LLVMBuildZExt(ctx->ac.builder,
- 
values[chan],
+ 
ac_to_integer(&ctx->ac, values[chan]),
  
ctx->ac.i32, "");
}
break;
@@ -2376,7 +2376,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx,
if (is_16bit) {
for (unsigned chan = 0; chan < 4; chan++)
values[chan] = 
LLVMBuildSExt(ctx->ac.builder,
- 
values[chan],
+ 
ac_to_integer(&ctx->ac, values[chan]),
  
ctx->ac.i32, "");
}
break;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-15 Thread Rhys Perry

This series adds support for:
- VK_KHR_shader_float16_int8
- VK_AMD_gpu_shader_half_float
- VK_AMD_gpu_shader_int16
- VK_KHR_8bit_storage
on VI+. Half floats are disabled on LLVM 7 because of a bug causing large
memory usage and long (or unbounded) compilation times with some CTS
tests.

It is written against the following patch series:
- https://patchwork.freedesktop.org/series/53454/ (v4)
- https://patchwork.freedesktop.org/series/53660/ (v1)

With LLVM 9, there are no reproducible Vulkan CTS regressions with Vega
and VI except for
dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_float_64_to_16.*
which fails or crashes because of unrelated radv bugs with 64-bit varyings
and because the tests use VK_FORMAT_R64_SFLOAT as a vertex format even
though radv does not support it.

With LLVM 9, there are no reproducible piglit regressions except for
glsl-array-bounds-12.shader_test because of an LLVM bug when
SLP vectorization is enabled.

With LLVM 8, there are no reproducible Vulkan CTS regressions with Vega
and VI except for those with LLVM 9 and a couple of tests because of an
LLVM bug after the SLP vectorizer and with the current lack of fallback
for 16-bit interpolation on LLVM versions before LLVM 9.

With LLVM 7, there are no reproducible Vulkan CTS regressions with Vega
and VI except for those with LLVM 9 and a couple of tests because of an
LLVM bug after the SLP vectorizer.

The SLP vectorization patch is marked as WIP because it exposes LLVM bugs
with piglit's glsl-array-bounds-12.shader_test, some Vulkan CTS tests and
some shader-db test for a game I can't remember. It also over-vectorizes
32-bit code which can cause significant worsening in generated code
quality.

The 16-bit interpolation patch is marked as WIP because it currently
requires intrinsics only available in LLVM 9 and does not have a fallback.

A branch on Github containing this series can be found at:
https://github.com/pendingchaos/mesa/commits/radv_fp16_int16_int8_v2

v2: rebase
v2: implement 16-bit interpolation
v2: move LLVMAddSLPVectorizePass to after LLVMAddEarlyCSEMemSSAPass
v2: run vectorization unconditionally on GFX9 and later
v2: remove ac_get_one(), ac_get_zero(), ac_get_onef() and ac_get_zerof()
v2: remove ac_int_of_size()
v2: fix 64-bit visit_load_var()
v2: mark VK_KHR_8bit_storage as DONE in features.txt
v2: mark SLP vectorization patch as WIP
v2: fix C++ style comment

Rhys Perry (41):
  radv: bitcast 16-bit outputs to integers
  radv: ensure export arguments are always float
  ac: add various helpers for float16/int16/int8
  ac/nir: implement 8-bit push constant, ssbo and ubo loads
  ac/nir: implement 8-bit ssbo stores
  ac/nir: fix 16-bit ssbo stores
  ac/nir: implement 8-bit nir_load_const_instr
  ac/nir: implement 8-bit conversions
  ac/nir: fix 64-bit nir_op_f2f16_rtz
  ac/nir: make ac_build_clamp work on all bit sizes
  ac/nir: make ac_build_fract work on all bit sizes
  ac/nir: make ac_build_isign work on all bit sizes
  ac/nir: make ac_build_fsign work on all bit sizes
  ac/nir: make ac_build_fdiv support 16-bit floats
  ac/nir: implement half-float nir_op_frcp
  ac/nir: implement half-float nir_op_frsq
  ac/nir: implement half-float nir_op_ldexp
  radv: lower 16-bit flrp
  ac/nir: support half floats in emit_b2f
  ac/nir: make emit_b2i work on all bit sizes
  ac/nir: implement 16-bit shifts
  compiler/nir: add lowering option for 16-bit ffma
  ac/nir: implement 16-bit ac_build_ddxy
  ac/nir: implement 8 and 16 bit ac_build_readlane
  nir: make bitfield_reverse and ifind_msb work with all integers
  ac/nir: make ac_find_lsb work on all bit sizes
  ac/nir: make ac_build_umsb work on all bit sizes
  ac/nir: implement 8 and 16 bit ac_build_imsb
  ac/nir: make ac_build_bit_count work on all bit sizes
  ac/nir: make ac_build_bitfield_reverse work on all bit sizes
  ac/nir: implement 16-bit pack/unpack opcodes
  ac/nir: add 8-bit types to glsl_base_to_llvm_type
  ac/nir,radv: create an array of varying output types
  ac/nir: store all outputs as f32
  radv: store all fragment shader inputs as f32
  radv: handle all fragment output types
  WIP: radv,ac: implement 16-bit interpolation
  WIP: ac,radv: run LLVM's SLP vectorizer
  ac/nir: generate better code for nir_op_f2f16_rtz
  ac/nir: have nir_op_f2f16 round to zero
  radv,docs: expose float16, int16 and int8 features and extensions

 docs/features.txt|   2 +-
 src/amd/common/ac_llvm_build.c   | 325 +++
 src/amd/common/ac_llvm_build.h   |  18 +-
 src/amd/common/ac_llvm_util.c|   8 +-
 src/amd/common/ac_nir_to_llvm.c  | 268 +++
 src/amd/common/ac_shader_abi.h   |   1 +
 src/amd/vulkan/radv_device.c |  17 ++
 src/amd/vulkan/radv_extensions.py|   4 +
 src/amd/vulkan/radv_nir_to_llvm.c| 123 +
 src/amd/vulkan/radv_pipeline.c   |  19 +-
 src/amd/vulkan/radv_shader.c |   4 +

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-13 Thread Rhys Perry

Quite a few of the patches aren't specific to a single extension as
many make code size-generic and some of the extensions intersect in
functionality.
It might still be possible to roughly order the patches by
functionality but I'm not sure if it would be very useful (possible
order in attachment). I didn't look at the actual content of the
patches when creating the attachment, this is from memory and looking
at the descriptions.
Would you like me to send out a v2 of this series done like that?

On Tue, 12 Feb 2019 at 17:08, Samuel Pitoiset  wrote:
>
> How about splitting this series in four different parts? One for every
> extension? Is this doable without too much troubles?
>
> On 2/12/19 6:02 PM, Rhys Perry wrote:
> > It currently requires review (and possibly rebasing). Marek Olšák send
> > some feedback for a few of the patches but other than that, it hasn't
> > gotten much attention.
> >
> > Also patch 35 seems to vectorize 32-bit code which can help or hurt
> > shaders quite a bit and seems to hurt shaders overall. I'm not yet
> > sure how to solve this without removing it or changing the result of
> > LLVM's SLP vectorizer significantly.
> > IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.
> >
> > I think I'll look into the issues with patch 35 again.
> >
> > On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  
> > wrote:
> >> What's the status of this?
> >>
> >> On 12/7/18 6:21 PM, Rhys Perry wrote:
> >>> This series add support for:
> >>> - VK_KHR_shader_float16_int8
> >>> - VK_AMD_gpu_shader_half_float
> >>> - VK_AMD_gpu_shader_int16
> >>> - VK_KHR_8bit_storage
> >>> on VI+. Half floats are currently disabled on LLVM 7 because of a bug
> >>> causing large memory usage and long (or unbounded) compilation times with
> >>> some tests.
> >>>
> >>> It depends on the follow patch series:
> >>> - https://patchwork.freedesktop.org/series/53454/
> >>> - https://patchwork.freedesktop.org/series/53602/
> >>> - https://patchwork.freedesktop.org/series/53660/
> >>>
> >>> An older version was tested on my Polaris card, but due to hardware issues
> >>> I currently can't test the latest version of the series.
> >>>
> >>> deqp-vk has no regressions and none of the newly enabled tests fail.
> >>>
> >>> Rhys Perry (38):
> >>> ac: add various helpers for float16/int16/int8
> >>> ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >>> ac/nir: implement 8-bit ssbo stores
> >>> ac/nir: fix 16-bit ssbo stores
> >>> ac/nir: implement 8-bit nir_load_const_instr
> >>> ac/nir: implement 8-bit conversions
> >>> ac/nir: fix 64-bit nir_op_f2f16_rtz
> >>> ac/nir: make ac_build_clamp work on all bit sizes
> >>> ac/nir: make ac_build_fract work on all bit sizes
> >>> ac/nir: make ac_build_isign work on all bit sizes
> >>> ac/nir: make ac_build_fsign work on all bit sizes
> >>> ac/nir: make ac_build_fdiv support 16-bit floats
> >>> ac/nir: implement half-float nir_op_frcp
> >>> ac/nir: implement half-float nir_op_frsq
> >>> ac/nir: implement half-float nir_op_ldexp
> >>> radv: lower 16-bit flrp
> >>> ac/nir: support half floats in emit_b2f
> >>> ac/nir: make emit_b2i work on all bit sizes
> >>> ac/nir: implement 16-bit shifts
> >>> compiler/nir: add lowering option for 16-bit ffma
> >>> ac/nir: implement 16-bit ac_build_ddxy
> >>> ac/nir: implement 8 and 16 bit ac_build_readlane
> >>> nir: make bitfield_reverse and ifind_msb work with all integers
> >>> ac/nir: make ac_find_lsb work on all bit sizes
> >>> ac/nir: make ac_build_umsb work on all bit sizes
> >>> ac/nir: implement 8 and 16 bit ac_build_imsb
> >>> ac/nir: make ac_build_bit_count work on all bit sizes
> >>> ac/nir: make ac_build_bitfield_reverse work on all bit sizes
> >>> ac/nir: implement 16-bit pack/unpack opcodes
> >>> ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
> >>> ac/nir,radv: create an array of varying output types
> >>> ac/nir: store all outputs as f32
> >>> radv: store all fragment shader inputs as f32
> >>> radv: handle all fragment output types
> >>> ac,ra

Re: [Mesa-dev] [PATCH 00/38] radv, ac: 16-bit and 8-bit arithmetic and 8-bit storage

2019-02-12 Thread Rhys Perry

It currently requires review (and possibly rebasing). Marek Olšák sent
some feedback for a few of the patches but other than that, it hasn't
gotten much attention.

Also patch 35 seems to vectorize 32-bit code which can help or hurt
shaders quite a bit and seems to hurt shaders overall. I'm not yet
sure how to solve this without removing it or changing the result of
LLVM's SLP vectorizer significantly.
IIRC enabling SLP vectorizer also uncovered a RA bug with a shader.

I think I'll look into the issues with patch 35 again.

On Tue, 12 Feb 2019 at 16:30, Samuel Pitoiset  wrote:
>
> What's the status of this?
>
> On 12/7/18 6:21 PM, Rhys Perry wrote:
> > This series add support for:
> > - VK_KHR_shader_float16_int8
> > - VK_AMD_gpu_shader_half_float
> > - VK_AMD_gpu_shader_int16
> > - VK_KHR_8bit_storage
> > on VI+. Half floats are currently disabled on LLVM 7 because of a bug
> > causing large memory usage and long (or unbounded) compilation times with
> > some tests.
> >
> > It depends on the follow patch series:
> > - https://patchwork.freedesktop.org/series/53454/
> > - https://patchwork.freedesktop.org/series/53602/
> > - https://patchwork.freedesktop.org/series/53660/
> >
> > An older version was tested on my Polaris card, but due to hardware issues
> > I currently can't test the latest version of the series.
> >
> > deqp-vk has no regressions and none of the newly enabled tests fail.
> >
> > Rhys Perry (38):
> >ac: add various helpers for float16/int16/int8
> >ac/nir: implement 8-bit push constant, ssbo and ubo loads
> >ac/nir: implement 8-bit ssbo stores
> >ac/nir: fix 16-bit ssbo stores
> >ac/nir: implement 8-bit nir_load_const_instr
> >ac/nir: implement 8-bit conversions
> >ac/nir: fix 64-bit nir_op_f2f16_rtz
> >ac/nir: make ac_build_clamp work on all bit sizes
> >ac/nir: make ac_build_fract work on all bit sizes
> >ac/nir: make ac_build_isign work on all bit sizes
> >ac/nir: make ac_build_fsign work on all bit sizes
> >ac/nir: make ac_build_fdiv support 16-bit floats
> >ac/nir: implement half-float nir_op_frcp
> >ac/nir: implement half-float nir_op_frsq
> >ac/nir: implement half-float nir_op_ldexp
> >radv: lower 16-bit flrp
> >ac/nir: support half floats in emit_b2f
> >ac/nir: make emit_b2i work on all bit sizes
> >ac/nir: implement 16-bit shifts
> >compiler/nir: add lowering option for 16-bit ffma
> >ac/nir: implement 16-bit ac_build_ddxy
> >ac/nir: implement 8 and 16 bit ac_build_readlane
> >nir: make bitfield_reverse and ifind_msb work with all integers
> >ac/nir: make ac_find_lsb work on all bit sizes
> >ac/nir: make ac_build_umsb work on all bit sizes
> >ac/nir: implement 8 and 16 bit ac_build_imsb
> >ac/nir: make ac_build_bit_count work on all bit sizes
> >ac/nir: make ac_build_bitfield_reverse work on all bit sizes
> >ac/nir: implement 16-bit pack/unpack opcodes
> >ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type
> >ac/nir,radv: create an array of varying output types
> >ac/nir: store all outputs as f32
> >radv: store all fragment shader inputs as f32
> >radv: handle all fragment output types
> >ac,radv: run LLVM's SLP vectorizer
> >ac/nir: generate better code for nir_op_f2f16_rtz
> >ac/nir: have nir_op_f2f16 round to zero
> >radv: expose float16, int16 and int8 features and extensions
> >
> >   src/amd/common/ac_llvm_build.c| 355 ++
> >   src/amd/common/ac_llvm_build.h|  22 +-
> >   src/amd/common/ac_llvm_util.c |   9 +-
> >   src/amd/common/ac_llvm_util.h |   1 +
> >   src/amd/common/ac_nir_to_llvm.c   | 258 +++
> >   src/amd/common/ac_shader_abi.h|   1 +
> >   src/amd/vulkan/radv_device.c  |  17 ++
> >   src/amd/vulkan/radv_extensions.py |   4 +
> >   src/amd/vulkan/radv_nir_to_llvm.c |  92 ---
> >   src/amd/vulkan/radv_shader.c  |   7 +
> >   src/broadcom/compiler/nir_to_vir.c|   1 +
> >   src/compiler/nir/nir.h|   1 +
> >   src/compiler/nir/nir_opcodes.py   |   4 +-
> >   src/compiler/nir/nir_opt_algebraic.py |   4 +-
> >   src/gallium/drivers/radeonsi/si_get.c |   1 +
> >   src/gallium/drivers/vc4/vc4_program.c |   1 +
> >   16 files changed, 516 insertions(+), 262 deletions(-)
> >
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 2/2] radv: add missed situations for scissor bug workaround

2019-01-19 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 65 
 src/amd/vulkan/radv_private.h|  2 +
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f430b4f20dd..6d538d7e88a 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -920,6 +920,8 @@ radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
  cmd_buffer->state.dynamic.scissor.scissors,
  cmd_buffer->state.dynamic.viewport.viewports,
  
cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
+
+   cmd_buffer->state.workaround_scissor_bug = false;
 }
 
 static void
@@ -1217,6 +1219,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer 
*cmd_buffer,
radv_update_zrange_precision(cmd_buffer, &att->ds, image,
 layout, false);
}
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 /**
@@ -1442,6 +1446,8 @@ radv_update_bound_fast_clear_color(struct radv_cmd_buffer 
*cmd_buffer,
radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx 
* 0x3c, 2);
radeon_emit(cs, color_values[0]);
radeon_emit(cs, color_values[1]);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 /**
@@ -1704,6 +1710,8 @@ void radv_set_db_count_control(struct radv_cmd_buffer 
*cmd_buffer)
}
 
radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, 
db_count_control);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -2185,6 +2193,27 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer,
state->last_primitive_reset_index = 
primitive_reset_index;
}
}
+
+   if (draw_info->strmout_buffer) {
+   uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
+
+   va += draw_info->strmout_buffer->offset +
+ draw_info->strmout_buffer_offset;
+
+   radeon_set_context_reg(cs, 
R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
+  draw_info->stride);
+
+   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
+   COPY_DATA_DST_SEL(COPY_DATA_REG) |
+   COPY_DATA_WR_CONFIRM);
+   radeon_emit(cs, va);
+   radeon_emit(cs, va >> 32);
+   radeon_emit(cs, 
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
+   radeon_emit(cs, 0); /* unused */
+
+   radv_cs_add_buffer(cmd_buffer->device->ws, cs, 
draw_info->strmout_buffer->bo);
+   }
 }
 
 static void radv_stage_flush(struct radv_cmd_buffer *cmd_buffer,
@@ -3470,27 +3499,6 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
struct radeon_winsys *ws = cmd_buffer->device->ws;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
-   if (info->strmout_buffer) {
-   uint64_t va = radv_buffer_get_va(info->strmout_buffer->bo);
-
-   va += info->strmout_buffer->offset +
- info->strmout_buffer_offset;
-
-   radeon_set_context_reg(cs, 
R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE,
-  info->stride);
-
-   radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
-   radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) |
-   COPY_DATA_DST_SEL(COPY_DATA_REG) |
-   COPY_DATA_WR_CONFIRM);
-   radeon_emit(cs, va);
-   radeon_emit(cs, va >> 32);
-   radeon_emit(cs, 
R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
-   radeon_emit(cs, 0); /* unused */
-
-   radv_cs_add_buffer(ws, cs, info->strmout_buffer->bo);
-   }
-
if (info->indirect) {
uint64_t va = radv_buffer_get_va(info->indirect->bo);
uint64_t count_va = 0;
@@ -3609,13 +3617,16 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
  * any context registers.
  */
 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
-bool indexed_draw)
+const struct radv_draw_info *info)
 {
struct radv_cmd_state *state = &cmd_buffer->state;
 
if (!cmd_buffer->device->physical_device->has_scissor_bug)
return false;
 
+   if (cmd_buffer->state.workaround_scissor_bug || info->strmout_buffer)
+   return true;
+
uint32_

[Mesa-dev] [PATCH 1/2] radv: pass radv_draw_info to radv_emit_draw_registers()

2019-01-19 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 118 +++
 1 file changed, 58 insertions(+), 60 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f41d6c0b3e7..f430b4f20dd 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2074,10 +2074,60 @@ radv_upload_graphics_shader_descriptors(struct 
radv_cmd_buffer *cmd_buffer, bool
radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 }
 
+struct radv_draw_info {
+   /**
+* Number of vertices.
+*/
+   uint32_t count;
+
+   /**
+* Index of the first vertex.
+*/
+   int32_t vertex_offset;
+
+   /**
+* First instance id.
+*/
+   uint32_t first_instance;
+
+   /**
+* Number of instances.
+*/
+   uint32_t instance_count;
+
+   /**
+* First index (indexed draws only).
+*/
+   uint32_t first_index;
+
+   /**
+* Whether it's an indexed draw.
+*/
+   bool indexed;
+
+   /**
+* Indirect draw parameters resource.
+*/
+   struct radv_buffer *indirect;
+   uint64_t indirect_offset;
+   uint32_t stride;
+
+   /**
+* Draw count parameters resource.
+*/
+   struct radv_buffer *count_buffer;
+   uint64_t count_buffer_offset;
+
+   /**
+* Stream output parameters resource.
+*/
+   struct radv_buffer *strmout_buffer;
+   uint64_t strmout_buffer_offset;
+};
+
 static void
-radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, bool indexed_draw,
-bool instanced_draw, bool indirect_draw,
-uint32_t draw_vertex_count)
+radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer,
+const struct radv_draw_info *draw_info)
 {
struct radeon_info *info = 
&cmd_buffer->device->physical_device->rad_info;
struct radv_cmd_state *state = &cmd_buffer->state;
@@ -2087,8 +2137,9 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer, bool indexed_draw,
 
/* Draw state. */
ia_multi_vgt_param =
-   si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw,
- indirect_draw, draw_vertex_count);
+   si_get_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count 
> 1,
+ draw_info->indirect,
+ draw_info->indirect ? 0 : 
draw_info->count);
 
if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
if (info->chip_class >= GFX9) {
@@ -2108,7 +2159,7 @@ radv_emit_draw_registers(struct radv_cmd_buffer 
*cmd_buffer, bool indexed_draw,
 
/* Primitive restart. */
primitive_reset_en =
-   indexed_draw && state->pipeline->graphics.prim_restart_enable;
+   draw_info->indexed && 
state->pipeline->graphics.prim_restart_enable;
 
if (primitive_reset_en != state->last_primitive_reset_en) {
state->last_primitive_reset_en = primitive_reset_en;
@@ -3411,57 +3462,6 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer 
*cmd_buffer,
}
 }
 
-struct radv_draw_info {
-   /**
-* Number of vertices.
-*/
-   uint32_t count;
-
-   /**
-* Index of the first vertex.
-*/
-   int32_t vertex_offset;
-
-   /**
-* First instance id.
-*/
-   uint32_t first_instance;
-
-   /**
-* Number of instances.
-*/
-   uint32_t instance_count;
-
-   /**
-* First index (indexed draws only).
-*/
-   uint32_t first_index;
-
-   /**
-* Whether it's an indexed draw.
-*/
-   bool indexed;
-
-   /**
-* Indirect draw parameters resource.
-*/
-   struct radv_buffer *indirect;
-   uint64_t indirect_offset;
-   uint32_t stride;
-
-   /**
-* Draw count parameters resource.
-*/
-   struct radv_buffer *count_buffer;
-   uint64_t count_buffer_offset;
-
-   /**
-* Stream output parameters resource.
-*/
-   struct radv_buffer *strmout_buffer;
-   uint64_t strmout_buffer_offset;
-};
-
 static void
 radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
   const struct radv_draw_info *info)
@@ -3672,9 +3672,7 @@ radv_emit_all_graphics_states(struct radv_cmd_buffer 
*cmd_buffer,
 
radv_cmd_buffer_flush_dynamic_state(cmd_buffer);
 
-   radv_emit_draw_registers(cmd_buffer, info->indexed,
-info->instance_count > 1, info->indirect,
-info->indirect ? 0 : info->count);
+   radv_emit_draw_registers(cmd_

[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-19 Thread Rhys Perry

It's common in some applications to bind a new graphics pipeline without
ending up changing any context registers.

This makes a pipeline have two command buffers: one for setting context
registers and one for everything else. The context register command buffer
is only emitted if it differs from the previous pipeline's.

v2: ensure late scissor emission is done when radv_emit_rbplus_state() is
called
v2: make use of cmd_buffer->state.workaround_scissor_bug

Signed-off-by: Rhys Perry 
---
This second version depends on the patch "radv: add missed situations for
scissor bug workaround".

 src/amd/vulkan/radv_cmd_buffer.c |  30 -
 src/amd/vulkan/radv_pipeline.c   | 217 ---
 src/amd/vulkan/radv_private.h|   2 +
 3 files changed, 141 insertions(+), 108 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 6d538d7e88a..f406a3a42f3 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
EVENT_INDEX(0));
}
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -857,10 +859,13 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << 
(i * 4);
sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << 
(i * 4);
}
+   /* TODO: avoid redundantly setting context registers */
radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 
3);
radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
+
+   cmd_buffer->state.workaround_scissor_bug = true;
 }
 
 static void
@@ -884,6 +889,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
 
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 
+   if (!cmd_buffer->state.emitted_pipeline ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
pipeline->ctx_cs.cdw ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
pipeline->ctx_cs_hash ||
+   memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
+  pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
+   radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
pipeline->ctx_cs.cdw);
+   cmd_buffer->state.workaround_scissor_bug = true;
+   }
+
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
if (!pipeline->shaders[i])
continue;
@@ -2939,6 +2953,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
return;
 
+   assert(!pipeline->ctx_cs.cdw);
+
cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 
pipeline->cs.cdw);
@@ -3630,20 +3646,16 @@ static bool radv_need_late_scissor_emission(struct 
radv_cmd_buffer *cmd_buffer,
uint32_t used_states = 
cmd_buffer->state.pipeline->graphics.needed_dynamic_state | 
~RADV_CMD_DIRTY_DYNAMIC_ALL;
 
/* Index, vertex and streamout buffers don't change context regs, and
-* pipeline is handled later.
+* pipeline is already handled.
 */
used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
 RADV_CMD_DIRTY_VERTEX_BUFFER |
 RADV_CMD_DIRTY_STREAMOUT_BUFFER |
 RADV_CMD_DIRTY_PIPELINE);
 
-   /* Assume all state changes except  these two can imply context rolls. 
*/
if (cmd_buffer->state.dirty & used_states)
return true;
 
-   if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
-   return true;
-
if (info->indexed && state->pipeline->graphics.prim_restart_enable &&
(state->index_type ? 0xu : 0xu) != 
state->last_primitive_reset_index)
return true;
@@ -3655,7 +3667,7 @@ static void
 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
  const struct radv_draw_info *info)
 {
-   bool late_scissor_emission = 
radv_need_late_scissor_emission(cmd_buffer, info);
+   bool late_scissor_emission;
 
if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipel

[Mesa-dev] [PATCH v3 0/5] nvc0: Implement EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry

This patch series implements EXT_shader_image_load_formatted on Maxwell+.

It should implement all of the spec except, if the extension is enabled,
passing image variables without a format qualifier to atomic operations
will not raise a compilation error like it should.

This is because knowing the format used in an image operation before
function inlining can be difficult, because formats don't have to (and
currently can't) be specified in the parameter declaration. So this series
leaves this issue to hopefully be resolved in a later patch.

I tested the second version of this series when it was released in June
2018 but I can't easily test this version. Nothing changed too much though
so it should be fine.

v2: change from PIPE_SHADER_CAP_* to PIPE_CAP_*
v2: fix broken feature detection in the state tracker
v2: move code in AlgebraicOpt::handleSULDP() to nv50_ir_ra.cpp
v3: rebase
v3: make use of u_pipe_screen_get_param_defaults
v3: move RA code into its own function

Rhys Perry (5):
  gallium: add support for formatted image loads
  mesa,glsl: add support for EXT_shader_image_load_formatted
  st/mesa: add support for EXT_shader_image_load_formatted
  nv50/ir: use suld.p on GM107+
  nvc0,nv50/ir: enable support for formatted image loads on GM107+

 src/compiler/glsl/ast_to_hir.cpp  |  5 +++
 src/compiler/glsl/glsl_parser_extras.cpp  |  1 +
 src/compiler/glsl/glsl_parser_extras.h|  7 
 src/gallium/auxiliary/util/u_screen.c |  1 +
 src/gallium/docs/source/screen.rst|  1 +
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  4 +++
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 ---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp |  3 +-
 .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++
 .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 +
 .../drivers/nouveau/nv30/nv30_screen.c|  1 +
 .../drivers/nouveau/nv50/nv50_screen.c|  1 +
 .../drivers/nouveau/nvc0/nvc0_screen.c|  2 ++
 src/gallium/drivers/swr/swr_screen.cpp|  1 +
 src/gallium/drivers/vc4/vc4_screen.c  |  1 +
 src/gallium/include/pipe/p_defines.h  |  1 +
 src/mesa/main/extensions_table.h  |  1 +
 src/mesa/main/mtypes.h|  1 +
 src/mesa/state_tracker/st_extensions.c|  1 +
 19 files changed, 100 insertions(+), 14 deletions(-)

-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 5/5] nvc0, nv50/ir: enable support for formatted image loads on GM107+

2019-01-16 Thread Rhys Perry

v3: rebase

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +--
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c| 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 295497be2f..6c134962b4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2414,12 +2414,11 @@ 
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
   bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
 TYPE_U32, bld.mkImm(0),
 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
-   if (su->op != OP_SUSTP && su->tex.format) {
+   if (su->op != OP_SUSTP && su->tex.format && su->tex.format->components > 0) 
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int blockwidth = format->bits[0] + format->bits[1] +
format->bits[2] + format->bits[3];
 
-  assert(format->components != 0);
   // make sure that the format doesn't mismatch when it's not FMT_NONE
   bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index b7cf2cd2e4..c47502cae1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -288,6 +288,8 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
   return class_3d >= GM200_3D_CLASS;
case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
   return class_3d >= GP100_3D_CLASS;
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
+  return class_3d >= GM107_3D_CLASS;
 
/* unsupported caps */
case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
@@ -334,7 +336,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
-   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 4/5] nv50/ir: use suld.p on GM107+

2019-01-16 Thread Rhys Perry

v3: rebase
v3: move RA code into its own function

Signed-off-by: Rhys Perry 
---
 src/gallium/drivers/nouveau/codegen/nv50_ir.h |  4 +++
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp| 34 ---
 .../drivers/nouveau/codegen/nv50_ir_print.cpp | 17 ++
 .../drivers/nouveau/codegen/nv50_ir_ra.cpp| 31 +
 4 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h 
b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 8085bb2f54..2388f3923c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -224,6 +224,10 @@ enum operation
 #define NV50_IR_SUBOP_SULD_ZERO0
 #define NV50_IR_SUBOP_SULD_TRAP1
 #define NV50_IR_SUBOP_SULD_SDCL3
+// These three are only for GM107+ and are set during register allocation
+#define NV50_IR_SUBOP_SULDP_RGBA   (0 << 2)
+#define NV50_IR_SUBOP_SULDP_RG (1 << 2)
+#define NV50_IR_SUBOP_SULDP_R  (2 << 2)
 #define NV50_IR_SUBOP_SUBFM_3D 1
 #define NV50_IR_SUBOP_SUCLAMP_2D   0x10
 #define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 0x10 : 0))
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index be00db3131..d7f4380b34 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -3257,26 +3257,36 @@ void
 CodeEmitterGM107::emitSULDx()
 {
const TexInstruction *insn = this->insn->asTex();
-   int type = 0;
 
emitInsn(0xeb00);
if (insn->op == OP_SULDB)
   emitField(0x34, 1, 1);
emitSUTarget();
 
-   switch (insn->dType) {
-   case TYPE_S8:   type = 1; break;
-   case TYPE_U16:  type = 2; break;
-   case TYPE_S16:  type = 3; break;
-   case TYPE_U32:  type = 4; break;
-   case TYPE_U64:  type = 5; break;
-   case TYPE_B128: type = 6; break;
-   default:
-  assert(insn->dType == TYPE_U8);
-  break;
+   if (insn->op == OP_SULDB) {
+  int type = 0;
+  switch (insn->dType) {
+  case TYPE_S8:   type = 1; break;
+  case TYPE_U16:  type = 2; break;
+  case TYPE_S16:  type = 3; break;
+  case TYPE_U32:  type = 4; break;
+  case TYPE_U64:  type = 5; break;
+  case TYPE_B128: type = 6; break;
+  default:
+ assert(insn->dType == TYPE_U8);
+ break;
+  }
+  emitField(0x14, 3, type);
+   } else {
+  int type = 0;
+  switch (insn->subOp & 0xc) {
+  case NV50_IR_SUBOP_SULDP_R:type = 0x1; break;
+  case NV50_IR_SUBOP_SULDP_RG:   type = 0x3; break;
+  case NV50_IR_SUBOP_SULDP_RGBA: type = 0xf; break;
+  }
+  emitField(0x14, 4, type);
}
emitLDSTc(0x18);
-   emitField(0x14, 3, type);
emitGPR  (0x00, insn->def(0));
emitGPR  (0x08, insn->src(0));
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 5dcbf3c3e0..43011c23af 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -246,6 +246,16 @@ static const char *xmadOpCModeStr[] =
"clo", "chi", "csfu", "cbcc"
 };
 
+static const char *suldOpStr[] =
+{
+   "zero", "trap", "sdcl"
+};
+
+static const char *suldSwizzleOpStr[] =
+{
+   "rgba", "rg", "r"
+};
+
 static const char *DataTypeStr[] =
 {
"-",
@@ -672,6 +682,13 @@ void Instruction::print() const
 PRINT("h%d ", (subOp & NV50_IR_SUBOP_XMAD_H1(i)) ? 1 : 0);
  break;
   }
+  case OP_SULDB:
+  case OP_SULDP:
+ if ((subOp & 0x3) < ARRAY_SIZE(suldOpStr))
+PRINT("%s ", suldOpStr[subOp & 0x3]);
+ if (op == OP_SULDP && subOp >> 2 < (int)ARRAY_SIZE(suldSwizzleOpStr))
+PRINT("%s ", suldSwizzleOpStr[subOp >> 2]);
+ break;
   default:
  if (subOp)
 PRINT("(SUBOP:%u) ", subOp);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 322b79fe62..8e57bda254 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -264,6 +264,7 @@ private:
 
   void addHazard(Instruction *i, const ValueRef *src);
   void textureMask(TexInstruction *);
+  void suldpMask(TexInstruction *);
   void addConstraint(Instruction *, int s, int n);
   bool detectConflict(Instruction *, int s);
 
@@ -1996,6 +1997,33 @@ 
RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
   tex->setDef(c, NULL);
 }
 
+void
+RegAlloc::InsertConstraintsPass::suldpMask(TexInstruction *tex)
+{
+   int max = 0;
+

[Mesa-dev] [PATCH v3 2/5] mesa, glsl: add support for EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry

v3: rebase

Signed-off-by: Rhys Perry 
Reviewed-by: Marek Olšák  (v2)
---
 src/compiler/glsl/ast_to_hir.cpp | 5 +
 src/compiler/glsl/glsl_parser_extras.cpp | 1 +
 src/compiler/glsl/glsl_parser_extras.h   | 7 +++
 src/mesa/main/extensions_table.h | 1 +
 src/mesa/main/mtypes.h   | 1 +
 5 files changed, 15 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 67a5a8c050..d9a57d37f6 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3476,6 +3476,11 @@ apply_image_qualifier_to_variable(const struct 
ast_type_qualifier *qual,
   }
 
   var->data.image_format = qual->image_format;
+   } else if (state->has_image_load_formatted()) {
+  if (var->data.mode == ir_var_uniform &&
+  state->EXT_shader_image_load_formatted_warn) {
+ _mesa_glsl_warning(loc, state, "GL_EXT_image_load_formatted used");
+  }
} else {
   if (var->data.mode == ir_var_uniform) {
  if (state->es_shader) {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp 
b/src/compiler/glsl/glsl_parser_extras.cpp
index 2048a7f900..1e035e94d8 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -721,6 +721,7 @@ static const _mesa_glsl_extension 
_mesa_glsl_supported_extensions[] = {
EXT(EXT_separate_shader_objects),
EXT(EXT_shader_framebuffer_fetch),
EXT(EXT_shader_framebuffer_fetch_non_coherent),
+   EXT(EXT_shader_image_load_formatted),
EXT(EXT_shader_implicit_conversions),
EXT(EXT_shader_integer_mix),
EXT_AEP(EXT_shader_io_blocks),
diff --git a/src/compiler/glsl/glsl_parser_extras.h 
b/src/compiler/glsl/glsl_parser_extras.h
index b17b5125e0..63a5cca5d2 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -344,6 +344,11 @@ struct _mesa_glsl_parse_state {
   return ARB_bindless_texture_enable;
}
 
+   bool has_image_load_formatted() const
+   {
+  return EXT_shader_image_load_formatted_enable;
+   }
+
bool has_implicit_conversions() const
{
   return EXT_shader_implicit_conversions_enable || is_version(120, 0);
@@ -816,6 +821,8 @@ struct _mesa_glsl_parse_state {
bool EXT_shader_framebuffer_fetch_warn;
bool EXT_shader_framebuffer_fetch_non_coherent_enable;
bool EXT_shader_framebuffer_fetch_non_coherent_warn;
+   bool EXT_shader_image_load_formatted_enable;
+   bool EXT_shader_image_load_formatted_warn;
bool EXT_shader_implicit_conversions_enable;
bool EXT_shader_implicit_conversions_warn;
bool EXT_shader_integer_mix_enable;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index dad38124d5..c3eb019f81 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -264,6 +264,7 @@ EXT(EXT_separate_shader_objects , dummy_true
 EXT(EXT_separate_specular_color , dummy_true   
  , GLL,  x ,  x ,  x , 1997)
 EXT(EXT_shader_framebuffer_fetch, EXT_shader_framebuffer_fetch 
  , GLL, GLC,  x , ES2, 2013)
 EXT(EXT_shader_framebuffer_fetch_non_coherent, 
EXT_shader_framebuffer_fetch_non_coherent, GLL, GLC,  x, ES2, 2018)
+EXT(EXT_shader_image_load_formatted , EXT_shader_image_load_formatted  
  , GLL, GLC,  x ,  x , 2014)
 EXT(EXT_shader_implicit_conversions , dummy_true   
  ,  x ,  x ,  x ,  31, 2013)
 EXT(EXT_shader_integer_mix  , EXT_shader_integer_mix   
  , GLL, GLC,  x ,  30, 2013)
 EXT(EXT_shader_io_blocks, dummy_true   
  ,  x ,  x ,  x ,  31, 2014)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 241c2b92f7..bd90727e26 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -4264,6 +4264,7 @@ struct gl_extensions
GLboolean EXT_render_snorm;
GLboolean EXT_semaphore;
GLboolean EXT_semaphore_fd;
+   GLboolean EXT_shader_image_load_formatted;
GLboolean EXT_shader_integer_mix;
GLboolean EXT_shader_samples_identical;
GLboolean EXT_stencil_two_side;
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 3/5] st/mesa: add support for EXT_shader_image_load_formatted

2019-01-16 Thread Rhys Perry

v3: rebase

Signed-off-by: Rhys Perry 
Reviewed-by: Marek Olšák  (v2)
---
 src/mesa/state_tracker/st_extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/state_tracker/st_extensions.c 
b/src/mesa/state_tracker/st_extensions.c
index 4628079260..b713eed969 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -717,6 +717,7 @@ void st_init_extensions(struct pipe_screen *screen,
   { o(ARB_shader_clock), PIPE_CAP_TGSI_CLOCK   
},
   { o(ARB_shader_draw_parameters),   PIPE_CAP_DRAW_PARAMETERS  
},
   { o(ARB_shader_group_vote),PIPE_CAP_TGSI_VOTE
},
+  { o(EXT_shader_image_load_formatted),  PIPE_CAP_IMAGE_LOAD_FORMATTED 
},
   { o(ARB_shader_stencil_export),PIPE_CAP_SHADER_STENCIL_EXPORT
},
   { o(ARB_shader_texture_image_samples), PIPE_CAP_TGSI_TXQS
},
   { o(ARB_shader_texture_lod),   PIPE_CAP_SM3  
},
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH v3 1/5] gallium: add support for formatted image loads

2019-01-16 Thread Rhys Perry

v3: rebase
v3: make use of u_pipe_screen_get_param_defaults

Signed-off-by: Rhys Perry 
---
 src/gallium/auxiliary/util/u_screen.c  | 1 +
 src/gallium/docs/source/screen.rst | 1 +
 src/gallium/drivers/nouveau/nv30/nv30_screen.c | 1 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c | 1 +
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 1 +
 src/gallium/drivers/swr/swr_screen.cpp | 1 +
 src/gallium/drivers/vc4/vc4_screen.c   | 1 +
 src/gallium/include/pipe/p_defines.h   | 1 +
 8 files changed, 8 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_screen.c 
b/src/gallium/auxiliary/util/u_screen.c
index c14edde859..470632f5ec 100644
--- a/src/gallium/auxiliary/util/u_screen.c
+++ b/src/gallium/auxiliary/util/u_screen.c
@@ -314,6 +314,7 @@ u_pipe_screen_get_param_defaults(struct pipe_screen 
*pscreen,
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_TGSI_ATOMFADD:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_MAX_GS_INVOCATIONS:
diff --git a/src/gallium/docs/source/screen.rst 
b/src/gallium/docs/source/screen.rst
index 9b75a407db..b2d0c401d5 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -483,6 +483,7 @@ The integer capabilities:
 * ``PIPE_CAP_TGSI_ATOMFADD``: Atomic floating point adds are supported on
   images, buffers, and shared memory.
 * ``PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND``: True if the driver needs blend 
state to use zero/one instead of destination alpha for RGB/XRGB formats.
+* ``PIPE_CAP_IMAGE_LOAD_FORMATTED``: True if a format for image loads does not 
need to be specified in the shader IR
 
 .. _pipe_capf:
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c 
b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 2b69a8f696..d6e0f43f6c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -243,6 +243,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_MAX_GS_INVOCATIONS:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c 
b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index d83926f2b1..ff92012894 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -310,6 +310,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
case PIPE_CAP_TGSI_ATOMFADD:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index f5f3cf..b7cf2cd2e4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -334,6 +334,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTERS:
case PIPE_CAP_MAX_COMBINED_HW_ATOMIC_COUNTER_BUFFERS:
case PIPE_CAP_SURFACE_SAMPLE_COUNT:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
 
case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index de9008ddf6..38b76366cb 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -364,6 +364,7 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET:
+   case PIPE_CAP_IMAGE_LOAD_FORMATTED:
   return 0;
case PIPE_CAP_MAX_GS_INVOCATIONS:
   return 32;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c 
b/src/gallium/drivers/vc4/vc4_screen.c
index e7f7c82c27..22de60f02c 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -293,6 +293,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen,
 case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
 case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
 case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+case PIPE_SHADER_CAP_IMAGE_LOAD_FORMATTED:
 return 0;
 case PIPE_SHADER_CAP_SCALAR_ISA:
 return 1;
diff --git a/src/gallium/include/pipe/p_defines.h 
b/src/gallium/include/pipe/p_defines.h
index ae53c723c7..5c0652d7a9 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -854,6 +854,7 @@ enum pipe_cap
PIPE_CAP_TGSI_ATOMFADD

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-16 Thread Rhys Perry

Seems I accidentally had it use Fedora 29's mesa build in both the
before and after runs...
Running again I get (again, average of 3 runs):
GeothermalValley: 58.2 fps -> 59.633 fps (+2.5%)
ProphetsTomb: 59 fps -> 60 fps (+1.7%)
SpineOfTheMountain: 64 fps -> 64.06667 fps (+0.1%) (1 extreme from
"before" run excluded)

Sorry for the noise.


On Wed, 16 Jan 2019 at 11:46, Rhys Perry  wrote:
>
> Rise of the Tomb Raider from without to with the change (average of 3 runs):
> SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%)
> ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%)
> GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%)
>
> So not much improvement (if any).
>
> On Wed, 16 Jan 2019 at 00:39, Rhys Perry  wrote:
> >
> > I did a before/after comparison during development with multiple runs
> > but only 1 before and after run to produce the numbers I sent. They
> > seemed to match up well enough to the runs during development, so I
> > wasn't too concerned.
> >
> > IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
> > The kernel/distro was 4.19.13 and Fedora 29. Also
> > "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
> > "performance" and
> > "/sys/class/drm/card*/device/power_dpm_force_performance_level" was
> > set to "high" while running.
> >
> > I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
> > get anything too different.
> >
> > On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
> > wrote:
> > >
> > > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  
> > > wrote:
> > > >
> > > > I did and found small improvements in Rise of the Tomb Raider. I
> > > > measured framerates ~104.3% that of without the changes for the
> > > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > > > for Prophets Tomb.
> > >
> > > My main question would be what the statistical significance is.  e.g.
> > > did you do one run of each, did you do multiple, and what was your
> > > test setup?
> > >
> > > Just curious because I have tried the exact same thing before and
> > > could not find anything more than noise.
> > >
> > > >
> > > > I found no change with Dota 2 but I've heard it's cpu-bound.
> > > >
> > > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset 
> > > >  wrote:
> > > > >
> > > > > Did you benchmark?
> > > > >
> > > > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > > > It's common in some applications to bind a new graphics pipeline 
> > > > > > without
> > > > > > ending up changing any context registers.
> > > > > >
> > > > > > > This has a pipeline have two command buffers: one for setting context
> > > > > > registers and one for everything else. The context register command 
> > > > > > buffer
> > > > > > is only emitted if it differs from the previous pipeline's.
> > > > > >
> > > > > > Signed-off-by: Rhys Perry 
> > > > > > ---
> > > > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > > > >   src/amd/vulkan/radv_pipeline.c   | 217 
> > > > > > ---
> > > > > >   src/amd/vulkan/radv_private.h|   2 +
> > > > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > > > >
> > > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > > > radv_cmd_buffer *cmd_buffer,
> > > > > >   }
> > > > > >   }
> > > > > >
> > > > > > -static void
> > > > > > +static bool
> > > > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > > > struct radv_pipeline *pipeline)
> > > > > >   {
> > > > > > @@ -646,7 +646,7 @@ radv_update_multisamp

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-16 Thread Rhys Perry

Rise of the Tomb Raider from without to with the change (average of 3 runs):
SpineOfTheMountain: 73.46667 fps -> 73.56667 fps (+0.14%)
ProphetsTomb: 58.4 fps -> 58.46667 fps (+0.11%)
GeothermalValley: 57.2 fps -> 57.46667 fps (+0.47%)

So not much improvement (if any).

On Wed, 16 Jan 2019 at 00:39, Rhys Perry  wrote:
>
> I did a before/after comparison during development with multiple runs
> but only 1 before and after run to produce the numbers I sent. They
> seemed to match up well enough to the runs during development, so I
> wasn't too concerned.
>
> IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
> The kernel/distro was 4.19.13 and Fedora 29. Also
> "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
> "performance" and
> "/sys/class/drm/card*/device/power_dpm_force_performance_level" was
> set to "high" while running.
>
> I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
> get anything too different.
>
> On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
> wrote:
> >
> > On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  wrote:
> > >
> > > I did and found small improvements in Rise of the Tomb Raider. I
> > > measured framerates ~104.3% that of without the changes for the
> > > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > > for Prophets Tomb.
> >
> > My main question would be what the statistical significance is.  e.g.
> > did you do one run of each, did you do multiple, and what was your
> > test setup?
> >
> > Just curious because I have tried the exact same thing before and
> > could not find anything more than noise.
> >
> > >
> > > I found no change with Dota 2 but I've heard it's cpu-bound.
> > >
> > > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > > wrote:
> > > >
> > > > Did you benchmark?
> > > >
> > > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > > It's common in some applications to bind a new graphics pipeline 
> > > > > without
> > > > > ending up changing any context registers.
> > > > >
> > > > > > This has a pipeline have two command buffers: one for setting context
> > > > > registers and one for everything else. The context register command 
> > > > > buffer
> > > > > is only emitted if it differs from the previous pipeline's.
> > > > >
> > > > > Signed-off-by: Rhys Perry 
> > > > > ---
> > > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > > >   src/amd/vulkan/radv_pipeline.c   | 217 
> > > > > ---
> > > > >   src/amd/vulkan/radv_private.h|   2 +
> > > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > > >
> > > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   }
> > > > >   }
> > > > >
> > > > > -static void
> > > > > +static bool
> > > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > > struct radv_pipeline *pipeline)
> > > > >   {
> > > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   cmd_buffer->sample_positions_needed = true;
> > > > >
> > > > >   if (old_pipeline && num_samples == 
> > > > > old_pipeline->graphics.ms.num_samples)
> > > > > - return;
> > > > > + return false;
> > > > >
> > > > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > > > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct 
> > > > > radv_cmd_buffer *cmd_buffer,
> > > > >   radeon_emit(cmd_buffer-&g

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-15 Thread Rhys Perry

I did a before/after comparison during development with multiple runs
but only 1 before and after run to produce the numbers I sent. They
seemed to match up well enough to the runs during development, so I
wasn't too concerned.

IIRC, the two runs were with a Vega 64 at 1080p with "High" settings.
The kernel/distro was 4.19.13 and Fedora 29. Also
"/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" was set to
"performance" and
"/sys/class/drm/card*/device/power_dpm_force_performance_level" was
set to "high" while running.

I'll do multiple runs of Rise of the Tomb Raider tomorrow and see if I
get anything too different.

On Wed, 16 Jan 2019 at 00:25, Bas Nieuwenhuizen  
wrote:
>
> On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry  wrote:
> >
> > I did and found small improvements in Rise of the Tomb Raider. I
> > measured framerates ~104.3% that of without the changes for the
> > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > for Prophets Tomb.
>
> My main question would be what the statistical significance is.  e.g.
> did you do one run of each, did you do multiple, and what was your
> test setup?
>
> Just curious because I have tried the exact same thing before and
> could not find anything more than noise.
>
> >
> > I found no change with Dota 2 but I've heard it's cpu-bound.
> >
> > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > wrote:
> > >
> > > Did you benchmark?
> > >
> > > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > > It's common in some applications to bind a new graphics pipeline without
> > > > ending up changing any context registers.
> > > >
> > > > This has a pipeline have two command buffers: one for setting context
> > > > registers and one for everything else. The context register command 
> > > > buffer
> > > > is only emitted if it differs from the previous pipeline's.
> > > >
> > > > Signed-off-by: Rhys Perry 
> > > > ---
> > > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > > >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> > > >   src/amd/vulkan/radv_private.h|   2 +
> > > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > > >
> > > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > > index f41d6c0b3e7..59903ab64d8 100644
> > > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   }
> > > >   }
> > > >
> > > > -static void
> > > > +static bool
> > > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > > struct radv_pipeline *pipeline)
> > > >   {
> > > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   cmd_buffer->sample_positions_needed = true;
> > > >
> > > >   if (old_pipeline && num_samples == 
> > > > old_pipeline->graphics.ms.num_samples)
> > > > - return;
> > > > + return false;
> > > >
> > > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct 
> > > > radv_cmd_buffer *cmd_buffer,
> > > >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> > > >   radeon_emit(cmd_buffer->cs, 
> > > > EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> > > >   }
> > > > +
> > > > + return true;
> > > >   }
> > > >
> > > >   static void
> > > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > > > *cmd_buffer)
> > > >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> > > >   }
> > > >
> > > > -static void
> > > > +static bool
> > > >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > > >   {
> > > >   struct radv_pipeline *pipeline =

Re: [Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change

2019-01-15 Thread Rhys Perry

I misread some code and forgot to remove it.

It was always unrelated to this patch.

On Wed, 16 Jan 2019 at 00:22, Bas Nieuwenhuizen  
wrote:
>
> On Tue, Jan 15, 2019 at 10:59 PM Rhys Perry  wrote:
> >
> > DXVK often sets dynamic state without actually changing it.
> >
> > Signed-off-by: Rhys Perry 
> > ---
> >  src/amd/vulkan/radv_cmd_buffer.c | 92 ++--
> >  1 file changed, 76 insertions(+), 16 deletions(-)
> >
> > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > b/src/amd/vulkan/radv_cmd_buffer.c
> > index 59903ab64d8..56b3c934c2e 100644
> > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > @@ -2965,6 +2965,11 @@ void radv_CmdSetViewport(
> > assert(firstViewport < MAX_VIEWPORTS);
> > assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
> >
> > +   if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
> > +   pViewports, viewportCount * sizeof(*pViewports))) {
> > +   return;
> > +   }
> > +
> > memcpy(state->dynamic.viewport.viewports + firstViewport, 
> > pViewports,
> >viewportCount * sizeof(*pViewports));
> >
> > @@ -2984,6 +2989,11 @@ void radv_CmdSetScissor(
> > assert(firstScissor < MAX_SCISSORS);
> > assert(total_count >= 1 && total_count <= MAX_SCISSORS);
> >
> > +   if (!memcmp(state->dynamic.scissor.scissors + firstScissor, 
> > pScissors,
> > +   scissorCount * sizeof(*pScissors))) {
> > +   return;
> > +   }
> > +
> > memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
> >scissorCount * sizeof(*pScissors));
> >
> > @@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth(
> > float   lineWidth)
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +
> > +   if (cmd_buffer->state.dynamic.line_width == lineWidth)
> > +   return;
> > +
> > cmd_buffer->state.dynamic.line_width = lineWidth;
> > cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
> >  }
> > @@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias(
> > float   depthBiasSlopeFactor)
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +   struct radv_cmd_state *state = &cmd_buffer->state;
> >
> > -   cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
> > -   cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
> > -   cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
> > +   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
> > +   state->dynamic.depth_bias.clamp == depthBiasClamp &&
> > +   state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
> > +   return;
> > +   }
> >
> > -   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
> > +   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
> > +   state->dynamic.depth_bias.clamp = depthBiasClamp;
> > +   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
> > +
> > +   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
> >  }
> >
> >  void radv_CmdSetBlendConstants(
> > @@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants(
> > const float blendConstants[4])
> >  {
> > RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
> > +   struct radv_cmd_state *state = &cmd_buffer->state;
> >
> > -   memcpy(cmd_buffer->state.dynamic.blend_constants,
> > -  blendConstants, sizeof(float) * 4);
> > +   if (!memcmp(state->dynamic.blend_constants, blendConstants, 
> > sizeof(float) * 4))
> > +   return;
> > +
> > +   memcpy(state->dynamic.blend_constants, blendConstants, 
> > sizeof(float) * 4);
> >
> > -   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
> > +   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
> >  }
> >
> >  void radv_CmdSetDepthBounds(
> > @@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds(
> > float   maxDepthB

[Mesa-dev] [PATCH] radv: prevent dirtying of dynamic state when it does not change

2019-01-15 Thread Rhys Perry

DXVK often sets dynamic state without actually changing it.

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 92 ++--
 1 file changed, 76 insertions(+), 16 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 59903ab64d8..56b3c934c2e 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2965,6 +2965,11 @@ void radv_CmdSetViewport(
assert(firstViewport < MAX_VIEWPORTS);
assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
 
+   if (!memcmp(state->dynamic.viewport.viewports + firstViewport,
+   pViewports, viewportCount * sizeof(*pViewports))) {
+   return;
+   }
+
memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
   viewportCount * sizeof(*pViewports));
 
@@ -2984,6 +2989,11 @@ void radv_CmdSetScissor(
assert(firstScissor < MAX_SCISSORS);
assert(total_count >= 1 && total_count <= MAX_SCISSORS);
 
+   if (!memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
+   scissorCount * sizeof(*pScissors))) {
+   return;
+   }
+
memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
   scissorCount * sizeof(*pScissors));
 
@@ -2995,6 +3005,10 @@ void radv_CmdSetLineWidth(
float   lineWidth)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   if (cmd_buffer->state.dynamic.line_width == lineWidth)
+   return;
+
cmd_buffer->state.dynamic.line_width = lineWidth;
cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
 }
@@ -3006,12 +3020,19 @@ void radv_CmdSetDepthBias(
float   depthBiasSlopeFactor)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   cmd_buffer->state.dynamic.depth_bias.bias = depthBiasConstantFactor;
-   cmd_buffer->state.dynamic.depth_bias.clamp = depthBiasClamp;
-   cmd_buffer->state.dynamic.depth_bias.slope = depthBiasSlopeFactor;
+   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
+   state->dynamic.depth_bias.clamp == depthBiasClamp &&
+   state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
+   return;
+   }
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
+   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
+   state->dynamic.depth_bias.clamp = depthBiasClamp;
+   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
+
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
 }
 
 void radv_CmdSetBlendConstants(
@@ -3019,11 +3040,14 @@ void radv_CmdSetBlendConstants(
const float blendConstants[4])
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   memcpy(cmd_buffer->state.dynamic.blend_constants,
-  blendConstants, sizeof(float) * 4);
+   if (!memcmp(state->dynamic.blend_constants, blendConstants, 
sizeof(float) * 4))
+   return;
+
+   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 
4);
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
 }
 
 void radv_CmdSetDepthBounds(
@@ -3032,11 +3056,17 @@ void radv_CmdSetDepthBounds(
float   maxDepthBounds)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
 
-   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
-   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
+   if (state->dynamic.depth_bounds.min == minDepthBounds &&
+   state->dynamic.depth_bounds.max == maxDepthBounds) {
+   return;
+   }
+
+   state->dynamic.depth_bounds.min = minDepthBounds;
+   state->dynamic.depth_bounds.max = maxDepthBounds;
 
-   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
+   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
 }
 
 void radv_CmdSetStencilCompareMask(
@@ -3045,13 +3075,21 @@ void radv_CmdSetStencilCompareMask(
uint32_tcompareMask)
 {
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_cmd_state *state = &cmd_buffer->state;
+   bool front_same = state->dynamic.stencil_compare_mask.front == 
compareMask;
+   bool back_same = state->dyn

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry

Sure

On Mon, 14 Jan 2019 at 16:50, Samuel Pitoiset  wrote:
>
> While you are at it, can you experiment with the tracked ctx stuff that
> RadeonSI implements (ie. SI_TRACKED_XXX)?
>
> This approach will likely be more costly from the CPU side, but it will
> reduce the number of register changes a lot more.
>
> Not sure if that will improve anything though, but I think it's worth a
> try?
>
> On 1/14/19 5:12 PM, Rhys Perry wrote:
> > I did and found small improvements in Rise of the Tomb Raider. I
> > measured framerates ~104.3% that of without the changes for the
> > Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> > for Prophets Tomb.
> >
> > I found no change with Dota 2 but I've heard it's cpu-bound.
> >
> > On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> > wrote:
> >> Did you benchmark?
> >>
> >> On 1/14/19 5:01 PM, Rhys Perry wrote:
> >>> It's common in some applications to bind a new graphics pipeline without
> >>> ending up changing any context registers.
> >>>
> >>> This makes a pipeline have two command buffers: one for setting context
> >>> registers and one for everything else. The context register command buffer
> >>> is only emitted if it differs from the previous pipeline's.
> >>>
> >>> Signed-off-by: Rhys Perry 
> >>> ---
> >>>src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> >>>src/amd/vulkan/radv_pipeline.c   | 217 ---
> >>>src/amd/vulkan/radv_private.h|   2 +
> >>>3 files changed, 150 insertions(+), 115 deletions(-)
> >>>
> >>> diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> >>> b/src/amd/vulkan/radv_cmd_buffer.c
> >>> index f41d6c0b3e7..59903ab64d8 100644
> >>> --- a/src/amd/vulkan/radv_cmd_buffer.c
> >>> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> >>> @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>}
> >>>}
> >>>
> >>> -static void
> >>> +static bool
> >>>radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> >>>  struct radv_pipeline *pipeline)
> >>>{
> >>> @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>cmd_buffer->sample_positions_needed = true;
> >>>
> >>>if (old_pipeline && num_samples == 
> >>> old_pipeline->graphics.ms.num_samples)
> >>> - return;
> >>> + return false;
> >>>
> >>>radeon_set_context_reg_seq(cmd_buffer->cs, 
> >>> R_028BDC_PA_SC_LINE_CNTL, 2);
> >>>radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> >>> @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> >>> *cmd_buffer,
> >>>radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> >>>radeon_emit(cmd_buffer->cs, 
> >>> EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> >>>}
> >>> +
> >>> + return true;
> >>>}
> >>>
> >>>static void
> >>> @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> >>> *cmd_buffer)
> >>>radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> >>>}
> >>>
> >>> -static void
> >>> +static bool
> >>>radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> >>>{
> >>>struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >>>
> >>>if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> >>> - return;
> >>> + return false;
> >>>
> >>> - radv_update_multisample_state(cmd_buffer, pipeline);
> >>> + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> >>> pipeline);
> >>>
> >>>cmd_buffer->scratch_size_needed =
> >>>  MAX2(cmd_buffer->scratch_size_needed,
> >>> @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> >>> *cmd_buffer)
> >>>
> >>>radeon_emit_array(c

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry

This is with Rise of the Tomb Raider's graphics settings set to "High"
by the way.

On Mon, 14 Jan 2019 at 16:12, Rhys Perry  wrote:
>
> I did and found small improvements in Rise of the Tomb Raider. I
> measured framerates ~104.3% that of without the changes for the
> Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> for Prophets Tomb.
>
> I found no change with Dota 2 but I've heard it's cpu-bound.
>
> On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  
> wrote:
> >
> > Did you benchmark?
> >
> > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > It's common in some applications to bind a new graphics pipeline without
> > > ending up changing any context registers.
> > >
> > > This makes a pipeline have two command buffers: one for setting context
> > > registers and one for everything else. The context register command buffer
> > > is only emitted if it differs from the previous pipeline's.
> > >
> > > Signed-off-by: Rhys Perry 
> > > ---
> > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> > >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> > >   src/amd/vulkan/radv_private.h|   2 +
> > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > >
> > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > > b/src/amd/vulkan/radv_cmd_buffer.c
> > > index f41d6c0b3e7..59903ab64d8 100644
> > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   }
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > > struct radv_pipeline *pipeline)
> > >   {
> > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   cmd_buffer->sample_positions_needed = true;
> > >
> > >   if (old_pipeline && num_samples == 
> > > old_pipeline->graphics.ms.num_samples)
> > > - return;
> > > + return false;
> > >
> > >   radeon_set_context_reg_seq(cmd_buffer->cs, 
> > > R_028BDC_PA_SC_LINE_CNTL, 2);
> > >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > > *cmd_buffer,
> > >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> > >   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) 
> > > | EVENT_INDEX(0));
> > >   }
> > > +
> > > + return true;
> > >   }
> > >
> > >   static void
> > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > > *cmd_buffer)
> > >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >   {
> > >   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > >
> > >   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> > > - return;
> > > + return false;
> > >
> > > - radv_update_multisample_state(cmd_buffer, pipeline);
> > > + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> > > pipeline);
> > >
> > >   cmd_buffer->scratch_size_needed =
> > > MAX2(cmd_buffer->scratch_size_needed,
> > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > > *cmd_buffer)
> > >
> > >   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, 
> > > pipeline->cs.cdw);
> > >
> > > + if (!cmd_buffer->state.emitted_pipeline ||
> > > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
> > > pipeline->ctx_cs.cdw ||
> > > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
> > > pipeline->ctx_cs_hash ||
> > > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
> > > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
> > >

Re: [Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry

I did and found small improvements in Rise of the Tomb Raider. I
measured framerates ~104.3% that of without the changes for the
Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
for Prophets Tomb.

I found no change with Dota 2 but I've heard it's cpu-bound.

On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset  wrote:
>
> Did you benchmark?
>
> On 1/14/19 5:01 PM, Rhys Perry wrote:
> > It's common in some applications to bind a new graphics pipeline without
> > ending up changing any context registers.
> >
> > This makes a pipeline have two command buffers: one for setting context
> > registers and one for everything else. The context register command buffer
> > is only emitted if it differs from the previous pipeline's.
> >
> > Signed-off-by: Rhys Perry 
> > ---
> >   src/amd/vulkan/radv_cmd_buffer.c |  46 +--
> >   src/amd/vulkan/radv_pipeline.c   | 217 ---
> >   src/amd/vulkan/radv_private.h|   2 +
> >   3 files changed, 150 insertions(+), 115 deletions(-)
> >
> > diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> > b/src/amd/vulkan/radv_cmd_buffer.c
> > index f41d6c0b3e7..59903ab64d8 100644
> > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   }
> >   }
> >
> > -static void
> > +static bool
> >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > struct radv_pipeline *pipeline)
> >   {
> > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   cmd_buffer->sample_positions_needed = true;
> >
> >   if (old_pipeline && num_samples == 
> > old_pipeline->graphics.ms.num_samples)
> > - return;
> > + return false;
> >
> >   radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 
> > 2);
> >   radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
> > *cmd_buffer,
> >   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> >   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
> > EVENT_INDEX(0));
> >   }
> > +
> > + return true;
> >   }
> >
> >   static void
> > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer 
> > *cmd_buffer)
> >   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> >   }
> >
> > -static void
> > +static bool
> >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> >   {
> >   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> >
> >   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> > - return;
> > + return false;
> >
> > - radv_update_multisample_state(cmd_buffer, pipeline);
> > + bool context_roll = radv_update_multisample_state(cmd_buffer, 
> > pipeline);
> >
> >   cmd_buffer->scratch_size_needed =
> > MAX2(cmd_buffer->scratch_size_needed,
> > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > *cmd_buffer)
> >
> >   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
> >
> > + if (!cmd_buffer->state.emitted_pipeline ||
> > + cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
> > pipeline->ctx_cs.cdw ||
> > + cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
> > pipeline->ctx_cs_hash ||
> > + memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
> > +pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
> > + radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
> > pipeline->ctx_cs.cdw);
> > + context_roll = true;
> > + }
> > +
> >   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
> >   if (!pipeline->shaders[i])
> >   continue;
> > @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
> > *cmd_buffer)
> >   cmd_buffer->state.emitted_pipeline = pipeline;
> >
> >   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
> > +
> > + return context_roll;

[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

2019-01-14 Thread Rhys Perry

It's common in some applications to bind a new graphics pipeline without
ending up changing any context registers.

This makes a pipeline have two command buffers: one for setting context
registers and one for everything else. The context register command buffer
is only emitted if it differs from the previous pipeline's.

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c |  46 +--
 src/amd/vulkan/radv_pipeline.c   | 217 ---
 src/amd/vulkan/radv_private.h|   2 +
 3 files changed, 150 insertions(+), 115 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f41d6c0b3e7..59903ab64d8 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
*cmd_buffer,
}
 }
 
-static void
+static bool
 radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
  struct radv_pipeline *pipeline)
 {
@@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
cmd_buffer->sample_positions_needed = true;
 
if (old_pipeline && num_samples == 
old_pipeline->graphics.ms.num_samples)
-   return;
+   return false;
 
radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
@@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer 
*cmd_buffer,
radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | 
EVENT_INDEX(0));
}
+
+   return true;
 }
 
 static void
@@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
 }
 
-static void
+static bool
 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
 {
struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
 
if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
-   return;
+   return false;
 
-   radv_update_multisample_state(cmd_buffer, pipeline);
+   bool context_roll = radv_update_multisample_state(cmd_buffer, pipeline);
 
cmd_buffer->scratch_size_needed =
  MAX2(cmd_buffer->scratch_size_needed,
@@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
 
radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
 
+   if (!cmd_buffer->state.emitted_pipeline ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != 
pipeline->ctx_cs.cdw ||
+   cmd_buffer->state.emitted_pipeline->ctx_cs_hash != 
pipeline->ctx_cs_hash ||
+   memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
+  pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
+   radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, 
pipeline->ctx_cs.cdw);
+   context_roll = true;
+   }
+
for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
if (!pipeline->shaders[i])
continue;
@@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
cmd_buffer->state.emitted_pipeline = pipeline;
 
cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
+
+   return context_roll;
 }
 
 static void
@@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer 
*cmd_buffer)
if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
return;
 
+   assert(!pipeline->ctx_cs.cdw);
+
cmd_buffer->state.emitted_compute_pipeline = pipeline;
 
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 
pipeline->cs.cdw);
@@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer 
*cmd_buffer,
  * any context registers.
  */
 static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
-bool indexed_draw)
+bool indexed_draw,
+bool pipeline_context_roll)
 {
struct radv_cmd_state *state = &cmd_buffer->state;
 
if (!cmd_buffer->device->physical_device->has_scissor_bug)
return false;
 
+   if (pipeline_context_roll)
+   return true;
+
uint32_t used_states = 
cmd_buffer->state.pipeline->graphics.needed_dynamic_state | 
~RADV_CMD_DIRTY_DYNAMIC_ALL;
 
/* Index, vertex and streamout buffers don't change context regs, and
-* pipeline is handle

[Mesa-dev] [PATCH] nir: fix copy-paste error in nir_lower_constant_initializers

2019-01-10 Thread Rhys Perry

Fixes: 393b59e0772e7bf0426bdf61c740752c4e09dde1
('nir: Rework nir_lower_constant_initializers() to handle functions')
---
 src/compiler/nir/nir_lower_constant_initializers.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_lower_constant_initializers.c 
b/src/compiler/nir/nir_lower_constant_initializers.c
index cbee59b1f30..959d1eabfca 100644
--- a/src/compiler/nir/nir_lower_constant_initializers.c
+++ b/src/compiler/nir/nir_lower_constant_initializers.c
@@ -104,10 +104,10 @@ nir_lower_constant_initializers(nir_shader *shader, 
nir_variable_mode modes)
  impl_progress |= lower_const_initializer(&builder, &shader->outputs);
 
   if ((modes & nir_var_private) && function->is_entrypoint)
- impl_progress |= lower_const_initializer(&builder, &shader->outputs);
+ impl_progress |= lower_const_initializer(&builder, &shader->globals);
 
   if ((modes & nir_var_system_value) && function->is_entrypoint)
- impl_progress |= lower_const_initializer(&builder, &shader->outputs);
+ impl_progress |= lower_const_initializer(&builder, 
&shader->system_values);
 
   if (modes & nir_var_function)
  impl_progress |= lower_const_initializer(&builder, 
&function->impl->locals);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radv: use dithered alpha-to-coverage

2019-01-09 Thread Rhys Perry

Seems I sent the wrong commit message.

It was meant to be:
This matches the behaviour of AMDVLK and hides banding.
It also seems to be allowed by the Vulkan spec.

Signed-off-by: Rhys Perry 

On Wed, 9 Jan 2019 at 14:40, Rhys Perry  wrote:
>
> This matches the behaviour of AMDVLK and hides banding
>
> TODO: run tests
> ---
>  src/amd/vulkan/radv_pipeline.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index 3561d17aaba..26ee59f11dd 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -681,10 +681,11 @@ radv_pipeline_init_blend_state(struct radv_pipeline 
> *pipeline,
> else
> blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
>
> -   blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
> -   S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
> -   S_028B70_ALPHA_TO_MASK_OFFSET2(2) |
> -   S_028B70_ALPHA_TO_MASK_OFFSET3(2);
> +   blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
> +   S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
> +   S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
> +   S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
> +   S_028B70_OFFSET_ROUND(1);
>
> if (vkms && vkms->alphaToCoverageEnable) {
> blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
> --
> 2.20.1
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: use dithered alpha-to-coverage

2019-01-09 Thread Rhys Perry

This matches the behaviour of AMDVLK and hides banding

TODO: run tests
---
 src/amd/vulkan/radv_pipeline.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3561d17aaba..26ee59f11dd 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -681,10 +681,11 @@ radv_pipeline_init_blend_state(struct radv_pipeline 
*pipeline,
else
blend.cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
 
-   blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) |
-   S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
-   S_028B70_ALPHA_TO_MASK_OFFSET2(2) |
-   S_028B70_ALPHA_TO_MASK_OFFSET3(2);
+   blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) |
+   S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
+   S_028B70_ALPHA_TO_MASK_OFFSET2(0) |
+   S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
+   S_028B70_OFFSET_ROUND(1);
 
if (vkms && vkms->alphaToCoverageEnable) {
blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] ac/nir, radv, radeonsi/nir: use correct indices for interpolation intrinsics

2019-01-09 Thread Rhys Perry

Fixes artifacts in World of Warcraft when Multi-sample Alpha-Test is
enabled.
It also fixes various piglit interpolateAt*() tests with NIR.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106595
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c  | 2 +-
 src/amd/common/ac_shader_abi.h   | 2 ++
 src/amd/vulkan/radv_nir_to_llvm.c| 2 ++
 src/gallium/drivers/radeonsi/si_shader_nir.c | 3 +++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 6d97212b805..8fd8580087f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2829,7 +2829,7 @@ static LLVMValueRef visit_interp(struct ac_nir_context 
*ctx,
LLVMValueRef src0 = NULL;
 
nir_variable *var = 
nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
-   int input_index = var->data.location - VARYING_SLOT_VAR0;
+   int input_index = ctx->abi->fs_input_attr_indices[var->data.location - 
VARYING_SLOT_VAR0];
switch (instr->intrinsic) {
case nir_intrinsic_interp_deref_at_centroid:
location = INTERP_CENTROID;
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 6b9a91c92a9..4f51aa9b0c0 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -76,6 +76,8 @@ struct ac_shader_abi {
 * driver_location.
 */
LLVMValueRef *inputs;
+   /* Varying -> attribute number mapping. Also NIR-only */
+   unsigned fs_input_attr_indices[MAX_VARYING];
 
void (*emit_outputs)(struct ac_shader_abi *abi,
 unsigned max_outputs,
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 322b10b67a0..cd58167b766 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -2239,6 +2239,8 @@ handle_fs_inputs(struct radv_shader_context *ctx,
 
if (LLVMIsUndef(interp_param))
ctx->shader_info->fs.flat_shaded_mask |= 1u << 
index;
+   if (i >= VARYING_SLOT_VAR0)
+   ctx->abi.fs_input_attr_indices[i - 
VARYING_SLOT_VAR0] = index;
++index;
} else if (i == VARYING_SLOT_CLIP_DIST0) {
int length = 
ctx->shader_info->info.ps.num_input_clips_culls;
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c 
b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 0a692277f64..d5b8a8416d9 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -1019,6 +1019,9 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, 
struct nir_shader *nir)
LLVMValueRef data[4];
unsigned loc = variable->data.location;
 
+   if (loc >= VARYING_SLOT_VAR0 && nir->info.stage == 
MESA_SHADER_FRAGMENT)
+   ctx->abi.fs_input_attr_indices[loc - 
VARYING_SLOT_VAR0] = input_idx / 4;
+
for (unsigned i = 0; i < attrib_count; i++) {
/* Packed components share the same location so 
skip
 * them if we have already processed the 
location.
-- 
2.20.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: allow secondary command buffers to inherit unknown framebuffers

2018-12-20 Thread Rhys Perry

Fixes: f4e499ec79 ('radv: add initial non-conformant radv vulkan driver')
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107986
Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_cmd_buffer.c | 59 ++--
 src/amd/vulkan/radv_meta_clear.c |  8 +
 src/amd/vulkan/radv_private.h|  2 ++
 3 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index c61310f3fc9..96fe5acb3bf 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -730,6 +730,9 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 
+   /* FIXME: handle when the framebuffer is unknown in secondary 
framebuffers */
+   assert(!cmd_buffer->inherit_unknown_fb);
+
unsigned sx_ps_downconvert = 0;
unsigned sx_blend_opt_epsilon = 0;
unsigned sx_blend_opt_control = 0;
@@ -1189,19 +1192,22 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer 
*cmd_buffer,
struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
-   struct radv_attachment_info *att;
-   uint32_t att_idx;
+   struct radv_attachment_info *att = NULL;
 
-   if (!framebuffer || !subpass)
+   if (!subpass)
return;
-
-   att_idx = subpass->depth_stencil_attachment.attachment;
-   if (att_idx == VK_ATTACHMENT_UNUSED)
+   if (!framebuffer && !cmd_buffer->inherit_unknown_fb)
return;
 
-   att = &framebuffer->attachments[att_idx];
-   if (att->attachment->image != image)
-   return;
+   if (framebuffer) {
+   uint32_t att_idx = subpass->depth_stencil_attachment.attachment;
+   if (att_idx == VK_ATTACHMENT_UNUSED)
+   return;
+
+   att = &framebuffer->attachments[att_idx];
+   if (att->attachment->image != image)
+   return;
+   }
 
radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
radeon_emit(cs, ds_clear_value.stencil);
@@ -1212,6 +1218,8 @@ radv_update_bound_fast_clear_ds(struct radv_cmd_buffer 
*cmd_buffer,
 */
if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
ds_clear_value.depth == 0.0) {
+   assert(att);
+
VkImageLayout layout = subpass->depth_stencil_attachment.layout;
 
radv_update_zrange_precision(cmd_buffer, &att->ds, image,
@@ -1426,19 +1434,22 @@ radv_update_bound_fast_clear_color(struct 
radv_cmd_buffer *cmd_buffer,
struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
const struct radv_subpass *subpass = cmd_buffer->state.subpass;
struct radeon_cmdbuf *cs = cmd_buffer->cs;
-   struct radv_attachment_info *att;
-   uint32_t att_idx;
 
-   if (!framebuffer || !subpass)
+   if (!subpass)
return;
-
-   att_idx = subpass->color_attachments[cb_idx].attachment;
-   if (att_idx == VK_ATTACHMENT_UNUSED)
+   if (!framebuffer && !cmd_buffer->inherit_unknown_fb)
return;
 
-   att = &framebuffer->attachments[att_idx];
-   if (att->attachment->image != image)
-   return;
+   if (framebuffer) {
+   struct radv_attachment_info *att;
+   uint32_t att_idx = 
subpass->color_attachments[cb_idx].attachment;
+   if (att_idx == VK_ATTACHMENT_UNUSED)
+   return;
+
+   att = &framebuffer->attachments[att_idx];
+   if (att->attachment->image != image)
+   return;
+   }
 
radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx 
* 0x3c, 2);
radeon_emit(cs, color_values[0]);
@@ -2528,6 +2539,7 @@ VkResult radv_BeginCommandBuffer(
cmd_buffer->state.last_first_instance = -1;
cmd_buffer->state.predication_type = -1;
cmd_buffer->usage_flags = pBeginInfo->flags;
+   cmd_buffer->inherit_unknown_fb = false;
 
if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
(pBeginInfo->flags & 
VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
@@ -2535,6 +2547,9 @@ VkResult radv_BeginCommandBuffer(
cmd_buffer->state.framebuffer = 
radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
cmd_buffer->state.pass = 
radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
 
+   if (cmd_buffer->state.pa

Re: [Mesa-dev] [PATCH 01/38] ac: add various helpers for float16/int16/int8

2018-12-19 Thread Rhys Perry

I would expect these helpers to be much more efficient than the
functions you suggested. They are also (in my opinion) more readable
than the suggested functions.

I don't think it matters much though, so I'm fine either way.

On Tue, 18 Dec 2018 at 02:48, Marek Olšák  wrote:
>
> On Fri, Dec 7, 2018 at 12:22 PM Rhys Perry  wrote:
>>
>> Signed-off-by: Rhys Perry 
>> ---
>>  src/amd/common/ac_llvm_build.c  | 123 ++--
>>  src/amd/common/ac_llvm_build.h  |  22 +-
>>  src/amd/common/ac_nir_to_llvm.c |  30 
>>  3 files changed, 154 insertions(+), 21 deletions(-)
>>
>> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
>> index 154cc696a2..cc7c6da5a4 100644
>> --- a/src/amd/common/ac_llvm_build.c
>> +++ b/src/amd/common/ac_llvm_build.c
>> @@ -87,12 +87,16 @@ ac_llvm_context_init(struct ac_llvm_context *ctx,
>> ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
>> ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
>>
>> +   ctx->i8_0 = LLVMConstInt(ctx->i8, 0, false);
>> +   ctx->i8_1 = LLVMConstInt(ctx->i8, 1, false);
>> ctx->i16_0 = LLVMConstInt(ctx->i16, 0, false);
>> ctx->i16_1 = LLVMConstInt(ctx->i16, 1, false);
>> ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
>> ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
>> ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
>> ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
>> +   ctx->f16_0 = LLVMConstReal(ctx->f16, 0.0);
>> +   ctx->f16_1 = LLVMConstReal(ctx->f16, 1.0);
>> ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
>> ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
>> ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
>> @@ -201,7 +205,9 @@ ac_get_type_size(LLVMTypeRef type)
>>
>>  static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, 
>> LLVMTypeRef t)
>>  {
>> -   if (t == ctx->f16 || t == ctx->i16)
>> +   if (t == ctx->i8)
>> +   return ctx->i8;
>> +   else if (t == ctx->f16 || t == ctx->i16)
>> return ctx->i16;
>> else if (t == ctx->f32 || t == ctx->i32)
>> return ctx->i32;
>> @@ -268,6 +274,110 @@ ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef 
>> v)
>> return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, 
>> type), "");
>>  }
>>
>> +LLVMValueRef ac_get_zerof(struct ac_llvm_context *ctx, LLVMTypeRef t)
>> +{
>> +   if (t == ctx->f16)
>> +   return ctx->f16_0;
>> +   else if (t == ctx->f32)
>> +   return ctx->f32_0;
>> +   else if (t == ctx->f64)
>> +   return ctx->f64_0;
>> +   else
>> +   unreachable("Unhandled float size");
>> +}
>> +
>> +LLVMValueRef ac_get_onef(struct ac_llvm_context *ctx, LLVMTypeRef t)
>> +{
>> +   if (t == ctx->f16)
>> +   return ctx->f16_1;
>> +   else if (t == ctx->f32)
>> +   return ctx->f32_1;
>> +   else if (t == ctx->f64)
>> +   return ctx->f64_1;
>> +   else
>> +   unreachable("Unhandled float size");
>> +}
>> +
>> +LLVMValueRef ac_get_zero(struct ac_llvm_context *ctx, LLVMTypeRef t)
>> +{
>> +   if (t == ctx->i8)
>> +   return ctx->i8_0;
>> +   else if (t == ctx->i16)
>> +   return ctx->i16_0;
>> +   else if (t == ctx->i32)
>> +   return ctx->i32_0;
>> +   else if (t == ctx->i64)
>> +   return ctx->i64_0;
>> +   else
>> +   unreachable("Unhandled bit size");
>> +}
>> +
>> +LLVMValueRef ac_get_one(struct ac_llvm_context *ctx, LLVMTypeRef t)
>> +{
>> +   if (t == ctx->i8)
>> +   return ctx->i8_1;
>> +   else if (t == ctx->i16)
>> +   return ctx->i16_1;
>> +   else if (t == ctx->i32)
>> +   return ctx->i32_1;
>> +   else if (t == ctx->i64)
>> +   return ctx->i64_1;
>> +   else
>> +   unreachable("Unhandled bit size");
>> +}
>
>
> You don't need these helpers. You can just use LLVMConstInt and LLVMConstReal.
>
>

Re: [Mesa-dev] [PATCH 0/2] radv/query: Use 1-bit booleans in query shaders

2018-12-19 Thread Rhys Perry

You missed this change (or something functionally similar):

diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index e7bb81489f6..5d35af05579 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -630,8 +630,8 @@ build_tfb_query_shader(struct radv_device *device)
avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1),
 nir_channel(&b, &load2->dest.ssa, 3));
nir_ssa_def *result_is_available =
-   nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
-nir_imm_int(&b, 0x8000));
+   nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
+nir_imm_int(&b, 0x8000)));

/* Only compute result if available. */
nir_if *available_if = nir_if_create(b.shader);

Other than that, this looks fine and seems to work correctly on my Vega.

With that change (and for what it's worth), this is:
Reviewed-by: Rhys Perry 



On Wed, 19 Dec 2018 at 19:45, Jason Ekstrand  wrote:
>
> When we switched over to 1-bit booleans, the radv query shaders ended up
> still using 32-bit booleans for most stuff.  While this is technically
> valid from an IR perspective, most of the NIR passes don't really support
> 32-bit booleans correctly anymore now that we've moved to 1-bit.  This tiny
> series attempts to convert the radv query shaders over to using 1-bit
> Booleans.
>
> I've only compile-tested it and read through it a couple times but am not
> really set up for testing radv.  I would very much appreciate if someone
> more familiar with radv could review and test these patches (and possibly
> rewrite them if appropriate).
>
> Cc: Dave Airlie 
> Cc: Timothy Arceri 
> Cc: Bas Nieuwenhuizen 
>
> Jason Ekstrand (2):
>   radv/query: Add a nir_flag_set helper
>   radv/query: Use 1-bit booleans in query shaders
>
>  src/amd/vulkan/radv_query.c | 67 +++--
>  1 file changed, 34 insertions(+), 33 deletions(-)
>
> --
> 2.19.2
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] nir: create 32-bit bcsel for 32-bit conditions

2018-12-18 Thread Rhys Perry

The 32-bit condition was in one of radv's meta shaders and had it from
the start.

32-bit conditions seem to be valid after lowering booleans to 32-bit.
Are they supposed to be invalid before the lowering?

On Wed, 19 Dec 2018 at 00:59, Jason Ekstrand  wrote:
>
> Seems reasonable, though I'm a bit surprised you're running peephole_select
> after lowering booleans.
>
> On December 18, 2018 18:16:46 Timothy Arceri  wrote:
>
> > Reviewed-by: Timothy Arceri 
> >
> > On 18/12/18 3:16 am, Rhys Perry wrote:
> >> Signed-off-by: Rhys Perry 
> >> ---
> >>   src/compiler/nir/nir_opt_peephole_select.c | 4 +++-
> >>   1 file changed, 3 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/src/compiler/nir/nir_opt_peephole_select.c
> >> b/src/compiler/nir/nir_opt_peephole_select.c
> >> index ad9d0abec0..241627ed99 100644
> >> --- a/src/compiler/nir/nir_opt_peephole_select.c
> >> +++ b/src/compiler/nir/nir_opt_peephole_select.c
> >> @@ -205,7 +205,9 @@ nir_opt_peephole_select_block(nir_block *block,
> >> nir_shader *shader,
> >>break;
> >>
> >> nir_phi_instr *phi = nir_instr_as_phi(instr);
> >> -  nir_alu_instr *sel = nir_alu_instr_create(shader, nir_op_bcsel);
> >> +  nir_op sel_op = nir_src_bit_size(if_stmt->condition) == 1 ?
> >> +  nir_op_bcsel : nir_op_b32csel;
> >> +  nir_alu_instr *sel = nir_alu_instr_create(shader, sel_op);
> >> nir_src_copy(&sel->src[0].src, &if_stmt->condition, sel);
> >> /* Splat the condition to all channels */
> >> memset(sel->src[0].swizzle, 0, sizeof sel->src[0].swizzle);
> >>
> > ___
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
>
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] nir: create 32-bit bcsel for 32-bit conditions

2018-12-17 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/compiler/nir/nir_opt_peephole_select.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_peephole_select.c 
b/src/compiler/nir/nir_opt_peephole_select.c
index ad9d0abec0..241627ed99 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -205,7 +205,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader 
*shader,
  break;
 
   nir_phi_instr *phi = nir_instr_as_phi(instr);
-  nir_alu_instr *sel = nir_alu_instr_create(shader, nir_op_bcsel);
+  nir_op sel_op = nir_src_bit_size(if_stmt->condition) == 1 ?
+  nir_op_bcsel : nir_op_b32csel;
+  nir_alu_instr *sel = nir_alu_instr_create(shader, sel_op);
   nir_src_copy(&sel->src[0].src, &if_stmt->condition, sel);
   /* Splat the condition to all channels */
   memset(sel->src[0].swizzle, 0, sizeof sel->src[0].swizzle);
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radv: don't set surf_index for stencil-only images

2018-12-14 Thread Rhys Perry

Fixes: f8d5b377c8b ('radv: set cb base tile swizzles for MRT speedups (v4)')
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108116
Signed-off-by: Rhys Perry 
---
Unfortunately I was not able to test this patch on a Polaris due to hardware
issues. It fixed the deqp-vk tests mentioned in the bugzilla without regressions
on Vega though.

 src/amd/vulkan/radv_image.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 2cff4d5283..2bd74e202f 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -986,7 +986,7 @@ radv_image_create(VkDevice _device,
 
image->shareable = vk_find_struct_const(pCreateInfo->pNext,

EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL;
-   if (!vk_format_is_depth(pCreateInfo->format) && !create_info->scanout 
&& !image->shareable) {
+   if (!vk_format_is_depth_or_stencil(pCreateInfo->format) && 
!create_info->scanout && !image->shareable) {
image->info.surf_index = &device->image_mrt_offset_counter;
}
 
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 1/2] ac: refactor visit_load_buffer

2018-12-13 Thread Rhys Perry

This is so that we can split different types of loads more easily.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c  |  8 ++--
 src/amd/common/ac_nir_to_llvm.c | 80 -
 src/compiler/nir/nir.h  |  2 +-
 3 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index abc18da13d..154cc696a2 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2943,9 +2943,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, 
LLVMValueRef value,
if (count == num_components)
return value;
 
-   LLVMValueRef masks[] = {
-   ctx->i32_0, ctx->i32_1,
-   LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+   LLVMValueRef masks[MAX2(count, 2)];
+   masks[0] = ctx->i32_0;
+   masks[1] = ctx->i32_1;
+   for (unsigned i = 2; i < count; i++)
+   masks[i] = LLVMConstInt(ctx->i32, i, false);
 
if (count == 1)
return LLVMBuildExtractElement(ctx->builder, value, masks[0],
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index a109f5a815..c05b45e084 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1623,37 +1623,43 @@ static LLVMValueRef visit_atomic_ssbo(struct 
ac_nir_context *ctx,
 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
   const nir_intrinsic_instr *instr)
 {
-   LLVMValueRef results[2];
-   int load_bytes;
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
-   int num_bytes = num_components * elem_size_bytes;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
LLVMValueRef glc = ctx->ac.i1false;
 
if (access & (ACCESS_VOLATILE | ACCESS_COHERENT))
glc = ctx->ac.i1true;
 
-   for (int i = 0; i < num_bytes; i += load_bytes) {
-   load_bytes = MIN2(num_bytes - i, 16);
-   const char *load_name;
-   LLVMTypeRef data_type;
-   LLVMValueRef offset = get_src(ctx, instr->src[1]);
-   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false);
-   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-   get_src(ctx, 
instr->src[0]), false);
-   LLVMValueRef vindex = ctx->ac.i32_0;
-
-   int idx = i ? 1 : 0;
+   LLVMValueRef offset = get_src(ctx, instr->src[1]);
+   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
+   get_src(ctx, instr->src[0]), 
false);
+   LLVMValueRef vindex = ctx->ac.i32_0;
+
+   LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.ssa);
+   LLVMTypeRef def_elem_type = num_components > 1 ? 
LLVMGetElementType(def_type) : def_type;
+
+   LLVMValueRef results[4];
+   for (int i = 0; i < num_components;) {
+   int num_elems = num_components - i;
+   if (num_elems * elem_size_bytes > 16)
+   num_elems = 16 / elem_size_bytes;
+   int load_bytes = num_elems * elem_size_bytes;
+
+   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * 
elem_size_bytes, false);
+
+   LLVMValueRef ret;
if (load_bytes == 2) {
-   results[idx] = ac_build_tbuffer_load_short(&ctx->ac,
-  rsrc,
-  vindex,
-  offset,
-  
ctx->ac.i32_0,
-  immoffset,
-  glc);
+   ret = ac_build_tbuffer_load_short(&ctx->ac,
+ rsrc,
+ vindex,
+ offset,
+ ctx->ac.i32_0,
+ immoffset,
+ glc);
} else {
+   const char *load_name;
+   LLVMTypeRef data_type;
switch (load_bytes) {
case 16:
case 12:
@@ -1679,33 +1685,23 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
glc,

[Mesa-dev] [PATCH 2/2] ac: split 16-bit ssbo loads that may not be dword aligned

2018-12-13 Thread Rhys Perry

Fixes: 7e7ee826982 ('ac: add support for 16bit buffer loads')
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108114
Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index c05b45e084..4a4c09cf5f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1642,6 +1642,8 @@ static LLVMValueRef visit_load_buffer(struct 
ac_nir_context *ctx,
LLVMValueRef results[4];
for (int i = 0; i < num_components;) {
int num_elems = num_components - i;
+   if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0)
+   num_elems = 1;
if (num_elems * elem_size_bytes > 16)
num_elems = 16 / elem_size_bytes;
int load_bytes = num_elems * elem_size_bytes;
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 2/2] radv: ensure export arguments are always float

2018-12-13 Thread Rhys Perry

(accidentally sent an incomplete email)

Seems my LLVM configuration was messed up and I might have used my
distro's LLVM too.

LLVM 8 and 7 with a release build pass.

A debug build of 8 (and my messed up builds of 7 and 8 which I thought
were release ones) results in an assert.
On Thu, 13 Dec 2018 at 08:38, Samuel Pitoiset  wrote:
>
>
>
> On 12/6/18 3:18 PM, Rhys Perry wrote:
> > ./deqp-vk 
> > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_frag
> > should crash with something like:
> > deqp-vk: lib/IR/Instructions.cpp:2590: static llvm::CastInst*
> > llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*,
> > llvm::Type*, const llvm::Twine&, llvm::Instruction*): Assertion
> > `castIsValid(op, S, Ty) && "Invalid cast!"' failed.
> > because it's trying to zext/sext a half float to a i32.
> >
> > and ./deqp-vk 
> > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_vert
> > should crash with something like:
> > deqp-vk: lib/IR/Instructions.cpp:348: void
> > llvm::CallInst::init(llvm::FunctionType*, llvm::Value*,
> > llvm::ArrayRef,
> > llvm::ArrayRef >, const
> > llvm::Twine&): Assertion `(i >= FTy->getNumParams() ||
> > FTy->getParamType(i) == Args[i]->getType()) && "Calling a function
> > with a bad signature!"' failed.
> > because it's calling the export intrinsic with incorrect argument types.
> >
> > For both tests, it seems to only assert with LLVM 8 for some reason.
>
> I guess you use a debug llvm build? Can you figure out what change
> introduces this crash?
>
> > On Thu, 6 Dec 2018 at 13:31, Samuel Pitoiset  
> > wrote:
> >>
> >>
> >>
> >> On 12/6/18 2:15 PM, Rhys Perry wrote:
> >>> So that the signature is correct and consistent, the inputs to an export
> >>> intrinsic should always be 32-bit floats.
> >>>
> >>> This and the previous commit fix a large number of crashes from
> >>> dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_*
> >>> tests
> >>>
> >>
> >> They don't crash for me? Please explain how to reproduce.
> >>
> >>> Fixes: b722b29f10d ('radv: add support for 16bit input/output')
> >>> Signed-off-by: Rhys Perry 
> >>> ---
> >>>src/amd/vulkan/radv_nir_to_llvm.c | 6 +-
> >>>1 file changed, 1 insertion(+), 5 deletions(-)
> >>>
> >>> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
> >>> b/src/amd/vulkan/radv_nir_to_llvm.c
> >>> index 0c91118e5a..90bcc8dbfe 100644
> >>> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> >>> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> >>> @@ -2464,12 +2464,8 @@ si_llvm_init_export_args(struct 
> >>> radv_shader_context *ctx,
> >>>} else
> >>>memcpy(&args->out[0], values, sizeof(values[0]) * 4);
> >>>
> >>> - for (unsigned i = 0; i < 4; ++i) {
> >>> - if (!(args->enabled_channels & (1 << i)))
> >>> - continue;
> >>> -
> >>> + for (unsigned i = 0; i < 4; ++i)
> >>>args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
> >>> - }
> >>>}
> >>>
> >>>static void
> >>>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 2/2] radv: ensure export arguments are always float

2018-12-13 Thread Rhys Perry

Seems my LLVM configuration was messed up and I might have used my
distro's LLVM too.

On Thu, 13 Dec 2018 at 08:38, Samuel Pitoiset  wrote:
>
>
>
> On 12/6/18 3:18 PM, Rhys Perry wrote:
> > ./deqp-vk 
> > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_frag
> > should crash with something like:
> > deqp-vk: lib/IR/Instructions.cpp:2590: static llvm::CastInst*
> > llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*,
> > llvm::Type*, const llvm::Twine&, llvm::Instruction*): Assertion
> > `castIsValid(op, S, Ty) && "Invalid cast!"' failed.
> > because it's trying to zext/sext a half float to a i32.
> >
> > and ./deqp-vk 
> > --deqp-case=dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_32_to_16.scalar_uint0_vert
> > should crash with something like:
> > deqp-vk: lib/IR/Instructions.cpp:348: void
> > llvm::CallInst::init(llvm::FunctionType*, llvm::Value*,
> > llvm::ArrayRef,
> > llvm::ArrayRef >, const
> > llvm::Twine&): Assertion `(i >= FTy->getNumParams() ||
> > FTy->getParamType(i) == Args[i]->getType()) && "Calling a function
> > with a bad signature!"' failed.
> > because it's calling the export intrinsic with incorrect argument types.
> >
> > For both tests, it seems to only assert with LLVM 8 for some reason.
>
> I guess you use a debug llvm build? Can you figure out what change
> introduces this crash?
>
> > On Thu, 6 Dec 2018 at 13:31, Samuel Pitoiset  
> > wrote:
> >>
> >>
> >>
> >> On 12/6/18 2:15 PM, Rhys Perry wrote:
> >>> So that the signature is correct and consistent, the inputs to an export
> >>> intrinsic should always be 32-bit floats.
> >>>
> >>> This and the previous commit fix a large number of crashes from
> >>> dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_*
> >>> tests
> >>>
> >>
> >> They don't crash for me? Please explain how to reproduce.
> >>
> >>> Fixes: b722b29f10d ('radv: add support for 16bit input/output')
> >>> Signed-off-by: Rhys Perry 
> >>> ---
> >>>src/amd/vulkan/radv_nir_to_llvm.c | 6 +-
> >>>1 file changed, 1 insertion(+), 5 deletions(-)
> >>>
> >>> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
> >>> b/src/amd/vulkan/radv_nir_to_llvm.c
> >>> index 0c91118e5a..90bcc8dbfe 100644
> >>> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> >>> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> >>> @@ -2464,12 +2464,8 @@ si_llvm_init_export_args(struct 
> >>> radv_shader_context *ctx,
> >>>} else
> >>>memcpy(&args->out[0], values, sizeof(values[0]) * 4);
> >>>
> >>> - for (unsigned i = 0; i < 4; ++i) {
> >>> - if (!(args->enabled_channels & (1 << i)))
> >>> - continue;
> >>> -
> >>> + for (unsigned i = 0; i < 4; ++i)
> >>>args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
> >>> - }
> >>>}
> >>>
> >>>static void
> >>>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radv: implement VK_EXT_sample_locations

2018-12-08 Thread Rhys Perry

A small number of questions/concerns:

- sampleLocationCoordinateRange[1] should probably be set to 0.9375,
  because of how the sample locations are encoded
- gl_SamplePosition doesn't seem like it would return the new sample
  locations
- R_028BD4_PA_SC_CENTROID_PRIORITY_{0,1} isn't updated. I'm not sure if
  this is required, but it's probably best to do so.
- I think it can pointlessly call radv_cayman_emit_msaa_sample_locs()
  before radv_emit_sample_locations()
- unlike AMDVLK, this doesn't seem to make use of sample location
  information during layout transitions?

You said that this implements the bare minimum, so you might already know
about some of these though (unless you were just talking about the
variableSampleLocations thing).
On Fri, 7 Dec 2018 at 16:19, Samuel Pitoiset  wrote:
>
> Basically, this extension allows applications to use custom
> sample locations. This only implements the bare minimum.
> It doesn't support variable sample locations during subpass.
>
> Most of the dEQP-VK.pipeline.multisample.sample_locations_ext.*
> CTS now pass.
>
> Only enabled on VI+ because it's untested on older chips.
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/amd/vulkan/radv_cmd_buffer.c  | 177 +-
>  src/amd/vulkan/radv_device.c  |  27 +
>  src/amd/vulkan/radv_extensions.py |   1 +
>  src/amd/vulkan/radv_pipeline.c|  30 +
>  src/amd/vulkan/radv_private.h |  26 +++--
>  5 files changed, 253 insertions(+), 8 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c 
> b/src/amd/vulkan/radv_cmd_buffer.c
> index b4aea5bc898..c4bebeda0ce 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -105,6 +105,7 @@ radv_bind_dynamic_state(struct radv_cmd_buffer 
> *cmd_buffer,
> dest->viewport.count = src->viewport.count;
> dest->scissor.count = src->scissor.count;
> dest->discard_rectangle.count = src->discard_rectangle.count;
> +   dest->sample_location.count = src->sample_location.count;
>
> if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
> if (memcmp(&dest->viewport.viewports, 
> &src->viewport.viewports,
> @@ -192,6 +193,22 @@ radv_bind_dynamic_state(struct radv_cmd_buffer 
> *cmd_buffer,
> }
> }
>
> +   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
> +   if (dest->sample_location.per_pixel != 
> src->sample_location.per_pixel ||
> +   dest->sample_location.grid_size.width != 
> src->sample_location.grid_size.width ||
> +   dest->sample_location.grid_size.height != 
> src->sample_location.grid_size.height ||
> +   memcmp(&dest->sample_location.locations,
> +  &src->sample_location.locations,
> +  src->sample_location.count * 
> sizeof(VkSampleLocationEXT))) {
> +   dest->sample_location.per_pixel = 
> src->sample_location.per_pixel;
> +   dest->sample_location.grid_size = 
> src->sample_location.grid_size;
> +   typed_memcpy(dest->sample_location.locations,
> +src->sample_location.locations,
> +src->sample_location.count);
> +   dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
> +   }
> +   }
> +
> cmd_buffer->state.dirty |= dest_mask;
>  }
>
> @@ -634,6 +651,135 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer 
> *cmd_buffer,
> }
>  }
>
> +/**
> + * Convert the user sample locations to hardware sample locations (the values
> + * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
> + */
> +static void
> +radv_convert_user_sample_locs(struct radv_sample_locations_state *state,
> + uint32_t x, uint32_t y, VkOffset2D *sample_locs)
> +{
> +   uint32_t x_offset = x % state->grid_size.width;
> +   uint32_t y_offset = y % state->grid_size.height;
> +   uint32_t num_samples = (uint32_t)state->per_pixel;
> +   VkSampleLocationEXT *user_locs;
> +   uint32_t pixel_offset;
> +
> +   pixel_offset = (x_offset + y_offset * state->grid_size.width) * 
> num_samples;
> +
> +   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
> +   user_locs = &state->locations[pixel_offset];
> +
> +   for (uint32_t i = 0; i < num_samples; i++) {
> +   float shifted_pos_x = user_locs[i].x - 0.5;
> +   float shifted_pos_y = user_locs[i].y - 0.5;
> +
> +   int32_t scaled_pos_x = floor(shifted_pos_x * 16);
> +   int32_t scaled_pos_y = floor(shifted_pos_y * 16);
> +
> +   sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
> +   sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
> +   }
> +}
> +
> +/**
> + * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
> + * locations.
> + */
> +static void
> +radv_compute_sample_locs_pixel(uint32_t

[Mesa-dev] [PATCH 29/38] ac/nir: implement 16-bit pack/unpack opcodes

2018-12-07 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index aac3330c0d..d69135cc25 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1011,6 +1011,30 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
break;
}
 
+   case nir_op_pack_32_2x16_split: {
+   LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, src, 2);
+   result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i32, 
"");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_x: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_0, "");
+   break;
+   }
+
+   case nir_op_unpack_32_2x16_split_y: {
+   LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
+   ctx->ac.v2i16,
+   "");
+   result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
+ctx->ac.i32_1, "");
+   break;
+   }
+
case nir_op_cube_face_coord: {
src[0] = ac_to_float(&ctx->ac, src[0]);
LLVMValueRef results[2];
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 24/38] ac/nir: make ac_find_lsb work on all bit sizes

2018-12-07 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.c | 31 +--
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 6266058b77..754ceda89b 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2752,29 +2752,10 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
 LLVMValueRef src0)
 {
unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
-   const char *intrin_name;
-   LLVMTypeRef type;
-   LLVMValueRef zero;
-
-   switch (src0_bitsize) {
-   case 64:
-   intrin_name = "llvm.cttz.i64";
-   type = ctx->i64;
-   zero = ctx->i64_0;
-   break;
-   case 32:
-   intrin_name = "llvm.cttz.i32";
-   type = ctx->i32;
-   zero = ctx->i32_0;
-   break;
-   case 16:
-   intrin_name = "llvm.cttz.i16";
-   type = ctx->i16;
-   zero = ctx->i16_0;
-   break;
-   default:
-   unreachable(!"invalid bitsize");
-   }
+   char intrin_name[64];
+   LLVMTypeRef type = ac_int_of_size(ctx, src0_bitsize);
+   LLVMValueRef zero = ac_get_zero(ctx, type);
+   snprintf(intrin_name, sizeof(intrin_name), "llvm.cttz.i%d", 
src0_bitsize);
 
LLVMValueRef params[2] = {
src0,
@@ -2795,9 +2776,7 @@ LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
  params, 2,
  AC_FUNC_ATTR_READNONE);
 
-   if (src0_bitsize == 64) {
-   lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");
-   }
+   lsb = ac_build_ui_cast(ctx, lsb, ctx->i32);
 
/* TODO: We need an intrinsic to skip this conditional. */
/* Check for zero: */
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 38/38] radv: expose float16, int16 and int8 features and extensions

2018-12-07 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/vulkan/radv_device.c  | 17 +
 src/amd/vulkan/radv_extensions.py |  4 
 src/amd/vulkan/radv_shader.c  |  3 +++
 3 files changed, 24 insertions(+)

diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index ad057a8750..8444651a84 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -848,6 +848,23 @@ void radv_GetPhysicalDeviceFeatures2(
features->geometryStreams = true;
break;
}
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
+   VkPhysicalDeviceFloat16Int8FeaturesKHR *features =
+   (VkPhysicalDeviceFloat16Int8FeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->shaderFloat16 = enabled && HAVE_LLVM >= 
0x0800;
+   features->shaderInt8 = enabled;
+   break;
+   }
+   case 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
+   VkPhysicalDevice8BitStorageFeaturesKHR *features =
+   (VkPhysicalDevice8BitStorageFeaturesKHR*)ext;
+   bool enabled = pdevice->rad_info.chip_class >= VI;
+   features->storageBuffer8BitAccess = enabled;
+   features->uniformAndStorageBuffer8BitAccess = enabled;
+   features->storagePushConstant8 = enabled;
+   break;
+   }
default:
break;
}
diff --git a/src/amd/vulkan/radv_extensions.py 
b/src/amd/vulkan/radv_extensions.py
index 6bdf988d11..62c58e98af 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -91,6 +91,8 @@ EXTENSIONS = [
 Extension('VK_KHR_xlib_surface',  6, 
'VK_USE_PLATFORM_XLIB_KHR'),
 Extension('VK_KHR_multiview', 1, True),
 Extension('VK_KHR_display',  23, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
+Extension('VK_KHR_shader_float16_int8',   1, 
'device->rad_info.chip_class >= VI'),
+Extension('VK_KHR_8bit_storage',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_EXT_direct_mode_display',   1, 
'VK_USE_PLATFORM_DISPLAY_KHR'),
 Extension('VK_EXT_acquire_xlib_display',  1, 
'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
 Extension('VK_EXT_calibrated_timestamps', 1, True),
@@ -117,6 +119,8 @@ EXTENSIONS = [
 Extension('VK_AMD_shader_core_properties',1, True),
 Extension('VK_AMD_shader_info',   1, True),
 Extension('VK_AMD_shader_trinary_minmax', 1, True),
+Extension('VK_AMD_gpu_shader_half_float', 1, 
'device->rad_info.chip_class >= VI && HAVE_LLVM >= 0x0800'),
+Extension('VK_AMD_gpu_shader_int16',  1, 
'device->rad_info.chip_class >= VI'),
 Extension('VK_GOOGLE_decorate_string',1, True),
 Extension('VK_GOOGLE_hlsl_functionality1',1, True),
 ]
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index a2ddf17680..921b9669f0 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -246,6 +246,9 @@ radv_shader_compile_to_nir(struct radv_device *device,
.storage_16bit = true,
.geometry_streams = true,
.transform_feedback = true,
+   .float16 = true,
+   .storage_8bit = true,
+   .int8 = true,
},
};
entry_point = spirv_to_nir(spirv, module->size / 4,
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 31/38] ac/nir, radv: create an array of varying output types

2018-12-07 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c   | 68 +++
 src/amd/common/ac_shader_abi.h|  1 +
 src/amd/vulkan/radv_nir_to_llvm.c |  3 ++
 3 files changed, 72 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index e4ae85a1ae..fa7b8c70f0 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3917,6 +3917,68 @@ static void visit_cf_list(struct ac_nir_context *ctx,
}
 }
 
+static unsigned traverse_var_component_slots(struct ac_llvm_context *ctx, bool 
vs_in,
+struct nir_variable *var, unsigned 
cur_offset,
+const struct glsl_type *cur_type,
+void (*cb)(struct ac_llvm_context 
*, unsigned, enum glsl_base_type, void *),
+void *cbdata)
+{
+   if (glsl_type_is_struct(cur_type)) {
+   for (unsigned i = 0; i < glsl_get_length(cur_type); i++) {
+   const struct glsl_type *ft = 
glsl_get_struct_field(cur_type, i);
+   cur_offset = traverse_var_component_slots(ctx, vs_in, 
var, cur_offset, ft, cb, cbdata);
+   }
+   return (cur_offset + 3) / 4 * 4;
+   }
+
+   enum glsl_base_type base_type = 
glsl_get_base_type(glsl_without_array_or_matrix(cur_type));
+
+   unsigned stride = 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type));
+   if (!var->data.compact)
+   stride = (stride + 3) / 4 * 4;
+   unsigned arr_len = MAX2(glsl_get_matrix_columns(cur_type), 1);
+   if (glsl_type_is_array(cur_type))
+   arr_len *= glsl_get_aoa_size(cur_type);
+   for (unsigned i = 0; i < arr_len; i++) {
+   for (unsigned j = 0; j < 
glsl_get_component_slots(glsl_without_array_or_matrix(cur_type)); j++) {
+   cb(ctx, cur_offset + var->data.location_frac + j, 
base_type, cbdata);
+   }
+   cur_offset += stride;
+   }
+   return cur_offset;
+}
+
+static void setup_output_type(struct ac_llvm_context *ctx, unsigned index, 
enum glsl_base_type base, void *output_types)
+{
+   LLVMTypeRef type;
+   switch (base) {
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   type = ctx->i8;
+   break;
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_UINT16:
+   type = ctx->i16;
+   break;
+   case GLSL_TYPE_FLOAT16:
+   type = ctx->f16;
+   break;
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_INT64:
+   case GLSL_TYPE_UINT64:
+   type = ctx->i32;
+   break;
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   default:
+   type = ctx->f32;
+   break;
+   }
+   ((LLVMTypeRef*)output_types)[index] = type;
+}
+
 void
 ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
 struct ac_shader_abi *abi,
@@ -3954,6 +4016,9 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx,
   ac_build_alloca_undef(ctx, type, "");
}
}
+
+   traverse_var_component_slots(ctx, false, variable, output_loc * 4,
+variable->type, &setup_output_type, 
abi->output_types);
 }
 
 static LLVMTypeRef
@@ -4077,6 +4142,9 @@ void ac_nir_translate(struct ac_llvm_context *ac, struct 
ac_shader_abi *abi,
 
ctx.main_function = 
LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
 
+   for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS * 4; i++)
+   ctx.abi->output_types[i] = ac->i32;
+
nir_foreach_variable(variable, &nir->outputs)
ac_handle_shader_output_decl(&ctx.ac, ctx.abi, nir, variable,
 ctx.stage);
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 6b9a91c92a..1d078fc42d 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -69,6 +69,7 @@ struct ac_shader_abi {
LLVMValueRef view_index;
 
LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4];
+   LLVMTypeRef output_types[AC_LLVM_MAX_OUTPUTS * 4];
 
/* For VS and PS: pre-loaded shader inputs.
 *
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 90bcc8dbfe..f114a86018 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3945,6 +3945,9 @@ radv_compile_gs_copy_shader(struct ac_llvm_compiler 
*ac_llvm,
ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
ac_setup_rings(&ctx);
 
+   for (unsigned i

[Mesa-dev] [PATCH 37/38] ac/nir: have nir_op_f2f16 round to zero

2018-12-07 Thread Rhys Perry

This is done in the hope that LLVM will one day be able to generate code
with vectorized v_cvt_pkrtz_f16_f32 instructions.

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 92b773981b..88b26e019f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -884,6 +884,7 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildUIToFP(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
break;
case nir_op_f2f16_rtz:
+   case nir_op_f2f16:
src[0] = ac_to_float(&ctx->ac, src[0]);
if (LLVMTypeOf(src[0]) == ctx->ac.f64)
src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ctx->ac.f32, "");
@@ -894,7 +895,6 @@ static void visit_alu(struct ac_nir_context *ctx, const 
nir_alu_instr *instr)
result = LLVMBuildTrunc(ctx->ac.builder, result, ctx->ac.i16, 
"");
break;
case nir_op_f2f16_rtne:
-   case nir_op_f2f16:
case nir_op_f2f32:
case nir_op_f2f64:
src[0] = ac_to_float(&ctx->ac, src[0]);
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 30/38] ac/nir: add 8-bit and 16-bit types to glsl_base_to_llvm_type

2018-12-07 Thread Rhys Perry

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_nir_to_llvm.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index d69135cc25..e4ae85a1ae 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3961,11 +3961,19 @@ glsl_base_to_llvm_type(struct ac_llvm_context *ac,
   enum glsl_base_type type)
 {
switch (type) {
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT8:
+   return ac->i8;
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_UINT16:
+   return ac->i16;
case GLSL_TYPE_INT:
case GLSL_TYPE_UINT:
case GLSL_TYPE_BOOL:
case GLSL_TYPE_SUBROUTINE:
return ac->i32;
+   case GLSL_TYPE_FLOAT16:
+   return ac->f16;
case GLSL_TYPE_FLOAT: /* TODO handle mediump */
return ac->f32;
case GLSL_TYPE_INT64:
-- 
2.19.2

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

1 2 3 4 >

1 - 100 of 342 matches

Mail list logo