Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

2017-04-28 Thread Marek Olšák
On Fri, Apr 28, 2017 at 9:33 PM, Nicolai Hähnle wrote:
> On 28.04.2017 17:59, Marek Olšák wrote:
>>
>> On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle wrote:
>>>
>>> On 24.04.2017 10:45, Marek Olšák wrote:


 From: Marek Olšák 

 ---
  src/gallium/drivers/radeonsi/si_shader.c | 87
 +---
  1 file changed, 70 insertions(+), 17 deletions(-)

 diff --git a/src/gallium/drivers/radeonsi/si_shader.c
 b/src/gallium/drivers/radeonsi/si_shader.c
 index a4c2ac0..392f85d 100644
 --- a/src/gallium/drivers/radeonsi/si_shader.c
 +++ b/src/gallium/drivers/radeonsi/si_shader.c
 @@ -7368,20 +7368,28 @@ static void
 si_count_scratch_private_memory(struct
 si_shader_context *ctx)
 LLVMTypeRef type =
 LLVMGetElementType(LLVMTypeOf(inst));
 /* No idea why LLVM aligns allocas to 4
 elements.
 */
 unsigned alignment = LLVMGetAlignment(inst);
 unsigned dw_size =
 align(llvm_get_type_size(type)
 / 4, alignment);
 ctx->shader->config.private_mem_vgprs +=
 dw_size;
 }
 bb = LLVMGetNextBasicBlock(bb);
 }
  }

 +static void si_init_exec_full_mask(struct si_shader_context *ctx)
 +{
 +   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
 +   lp_build_intrinsic(ctx->gallivm.builder,
 +  "llvm.amdgcn.init.exec", ctx->voidt,
 +  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
 +}
 +
  static void si_init_exec_from_input(struct si_shader_context *ctx,
 unsigned param, unsigned bitoffset)
  {
 LLVMValueRef args[] = {
 LLVMGetParam(ctx->main_fn, param),
 LLVMConstInt(ctx->i32, bitoffset, 0),
 };
 lp_build_intrinsic(ctx->gallivm.builder,
"llvm.amdgcn.init.exec.from.input",
ctx->voidt, args, 2,
 LP_FUNC_ATTR_CONVERGENT);
 @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct
 si_shader
 *shader,
 key->ps_epilog.states = shader->key.part.ps.epilog;
  }

  /**
   * Build the GS prolog function. Rotate the input vertices for triangle
 strips
   * with adjacency.
   */
  static void si_build_gs_prolog_function(struct si_shader_context *ctx,
 union si_shader_part_key *key)
  {
 -   const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
 -   const unsigned num_vgprs = 8;
 +   unsigned num_sgprs, num_vgprs;
 struct gallivm_state *gallivm = &ctx->gallivm;
 LLVMBuilderRef builder = gallivm->builder;
 -   LLVMTypeRef params[32];
 -   LLVMTypeRef returns[32];
 +   LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
 +   LLVMTypeRef returns[48];
 LLVMValueRef func, ret;

 +   if (ctx->screen->b.chip_class >= GFX9) {
 +   num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
 +   num_vgprs = 5; /* ES inputs are not needed by GS */
 +   } else {
 +   num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
 +   num_vgprs = 8;
 +   }
 +
 for (unsigned i = 0; i < num_sgprs; ++i) {
 params[i] = ctx->i32;
 returns[i] = ctx->i32;
 }

 for (unsigned i = 0; i < num_vgprs; ++i) {
 params[num_sgprs + i] = ctx->i32;
 returns[num_sgprs + i] = ctx->f32;
 }

 /* Create the function. */
 si_create_function(ctx, "gs_prolog", returns, num_sgprs +
 num_vgprs,
params, num_sgprs + num_vgprs, num_sgprs -
 1);
 func = ctx->main_fn;

 +   /* Set the full EXEC mask for the prolog, because we are only
 fiddling
 +* with registers here. The main shader part will set the
 correct
 EXEC
 +* mask.
 +*/
 +   if (ctx->screen->b.chip_class >= GFX9)
 +   si_init_exec_full_mask(ctx);
 +
 /* Copy inputs to outputs. This should be no-op, as the
 registers
 match,
  * but it will prevent the compiler from overwriting them
 unintentionally.
  */
 ret = ctx->return_value;
 for (unsigned i = 0; i < num_sgprs; i++) {
 LLVMValueRef p = LLVMGetParam(func, i);
 ret = LLVMBuildInsertValue(builder, ret, p, i, "");
 }
 for (unsigned i = 0; i < num_vgprs; i++) {
 LLVMValueRef p = LLVMGetPa

Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

2017-04-28 Thread Nicolai Hähnle

On 28.04.2017 17:59, Marek Olšák wrote:

On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle wrote:

On 24.04.2017 10:45, Marek Olšák wrote:


From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_shader.c | 87
+---
 1 file changed, 70 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c
b/src/gallium/drivers/radeonsi/si_shader.c
index a4c2ac0..392f85d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct
si_shader_context *ctx)
LLVMTypeRef type =
LLVMGetElementType(LLVMTypeOf(inst));
/* No idea why LLVM aligns allocas to 4 elements.
*/
unsigned alignment = LLVMGetAlignment(inst);
unsigned dw_size = align(llvm_get_type_size(type)
/ 4, alignment);
ctx->shader->config.private_mem_vgprs += dw_size;
}
bb = LLVMGetNextBasicBlock(bb);
}
 }

+static void si_init_exec_full_mask(struct si_shader_context *ctx)
+{
+   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+   lp_build_intrinsic(ctx->gallivm.builder,
+  "llvm.amdgcn.init.exec", ctx->voidt,
+  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+}
+
 static void si_init_exec_from_input(struct si_shader_context *ctx,
unsigned param, unsigned bitoffset)
 {
LLVMValueRef args[] = {
LLVMGetParam(ctx->main_fn, param),
LLVMConstInt(ctx->i32, bitoffset, 0),
};
lp_build_intrinsic(ctx->gallivm.builder,
   "llvm.amdgcn.init.exec.from.input",
   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
@@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader
*shader,
key->ps_epilog.states = shader->key.part.ps.epilog;
 }

 /**
  * Build the GS prolog function. Rotate the input vertices for triangle
strips
  * with adjacency.
  */
 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
union si_shader_part_key *key)
 {
-   const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-   const unsigned num_vgprs = 8;
+   unsigned num_sgprs, num_vgprs;
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef params[32];
-   LLVMTypeRef returns[32];
+   LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
+   LLVMTypeRef returns[48];
LLVMValueRef func, ret;

+   if (ctx->screen->b.chip_class >= GFX9) {
+   num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+   num_vgprs = 5; /* ES inputs are not needed by GS */
+   } else {
+   num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+   num_vgprs = 8;
+   }
+
for (unsigned i = 0; i < num_sgprs; ++i) {
params[i] = ctx->i32;
returns[i] = ctx->i32;
}

for (unsigned i = 0; i < num_vgprs; ++i) {
params[num_sgprs + i] = ctx->i32;
returns[num_sgprs + i] = ctx->f32;
}

/* Create the function. */
si_create_function(ctx, "gs_prolog", returns, num_sgprs +
num_vgprs,
   params, num_sgprs + num_vgprs, num_sgprs - 1);
func = ctx->main_fn;

+   /* Set the full EXEC mask for the prolog, because we are only
fiddling
+* with registers here. The main shader part will set the correct
EXEC
+* mask.
+*/
+   if (ctx->screen->b.chip_class >= GFX9)
+   si_init_exec_full_mask(ctx);
+
/* Copy inputs to outputs. This should be no-op, as the registers
match,
 * but it will prevent the compiler from overwriting them
unintentionally.
 */
ret = ctx->return_value;
for (unsigned i = 0; i < num_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(builder, ret, p, i, "");
}
for (unsigned i = 0; i < num_vgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
p = LLVMBuildBitCast(builder, p, ctx->f32, "");
ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i,
"");
}

if (key->gs_prolog.states.tri_strip_adj_fix) {
/* Remap the input vertices for every other primitive. */
-   const unsigned vtx_params[6] = {
+   const unsigned gfx6_vtx_params[6] = {
num_sgprs,
num_sgprs + 1,
num_sgprs + 3,
num_sgprs + 4,
num_sgprs + 5,
num_sgprs + 6
};
+   const unsi

Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

2017-04-28 Thread Marek Olšák
On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle wrote:
> On 24.04.2017 10:45, Marek Olšák wrote:
>>
>> From: Marek Olšák 
>>
>> ---
>>  src/gallium/drivers/radeonsi/si_shader.c | 87
>> +---
>>  1 file changed, 70 insertions(+), 17 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c
>> b/src/gallium/drivers/radeonsi/si_shader.c
>> index a4c2ac0..392f85d 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct
>> si_shader_context *ctx)
>> LLVMTypeRef type =
>> LLVMGetElementType(LLVMTypeOf(inst));
>> /* No idea why LLVM aligns allocas to 4 elements.
>> */
>> unsigned alignment = LLVMGetAlignment(inst);
>> unsigned dw_size = align(llvm_get_type_size(type)
>> / 4, alignment);
>> ctx->shader->config.private_mem_vgprs += dw_size;
>> }
>> bb = LLVMGetNextBasicBlock(bb);
>> }
>>  }
>>
>> +static void si_init_exec_full_mask(struct si_shader_context *ctx)
>> +{
>> +   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
>> +   lp_build_intrinsic(ctx->gallivm.builder,
>> +  "llvm.amdgcn.init.exec", ctx->voidt,
>> +  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
>> +}
>> +
>>  static void si_init_exec_from_input(struct si_shader_context *ctx,
>> unsigned param, unsigned bitoffset)
>>  {
>> LLVMValueRef args[] = {
>> LLVMGetParam(ctx->main_fn, param),
>> LLVMConstInt(ctx->i32, bitoffset, 0),
>> };
>> lp_build_intrinsic(ctx->gallivm.builder,
>>"llvm.amdgcn.init.exec.from.input",
>>ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
>> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader
>> *shader,
>> key->ps_epilog.states = shader->key.part.ps.epilog;
>>  }
>>
>>  /**
>>   * Build the GS prolog function. Rotate the input vertices for triangle
>> strips
>>   * with adjacency.
>>   */
>>  static void si_build_gs_prolog_function(struct si_shader_context *ctx,
>> union si_shader_part_key *key)
>>  {
>> -   const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>> -   const unsigned num_vgprs = 8;
>> +   unsigned num_sgprs, num_vgprs;
>> struct gallivm_state *gallivm = &ctx->gallivm;
>> LLVMBuilderRef builder = gallivm->builder;
>> -   LLVMTypeRef params[32];
>> -   LLVMTypeRef returns[32];
>> +   LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
>> +   LLVMTypeRef returns[48];
>> LLVMValueRef func, ret;
>>
>> +   if (ctx->screen->b.chip_class >= GFX9) {
>> +   num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
>> +   num_vgprs = 5; /* ES inputs are not needed by GS */
>> +   } else {
>> +   num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
>> +   num_vgprs = 8;
>> +   }
>> +
>> for (unsigned i = 0; i < num_sgprs; ++i) {
>> params[i] = ctx->i32;
>> returns[i] = ctx->i32;
>> }
>>
>> for (unsigned i = 0; i < num_vgprs; ++i) {
>> params[num_sgprs + i] = ctx->i32;
>> returns[num_sgprs + i] = ctx->f32;
>> }
>>
>> /* Create the function. */
>> si_create_function(ctx, "gs_prolog", returns, num_sgprs +
>> num_vgprs,
>>params, num_sgprs + num_vgprs, num_sgprs - 1);
>> func = ctx->main_fn;
>>
>> +   /* Set the full EXEC mask for the prolog, because we are only
>> fiddling
>> +* with registers here. The main shader part will set the correct
>> EXEC
>> +* mask.
>> +*/
>> +   if (ctx->screen->b.chip_class >= GFX9)
>> +   si_init_exec_full_mask(ctx);
>> +
>> /* Copy inputs to outputs. This should be no-op, as the registers
>> match,
>>  * but it will prevent the compiler from overwriting them
>> unintentionally.
>>  */
>> ret = ctx->return_value;
>> for (unsigned i = 0; i < num_sgprs; i++) {
>> LLVMValueRef p = LLVMGetParam(func, i);
>> ret = LLVMBuildInsertValue(builder, ret, p, i, "");
>> }
>> for (unsigned i = 0; i < num_vgprs; i++) {
>> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
>> p = LLVMBuildBitCast(builder, p, ctx->f32, "");
>> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i,
>> "");
>> }
>>
>> if (key->gs_prolog.states.tri_strip_adj_fix) {
>> /* Remap the input vertices for every other primitive. */
>> -   const unsigned vtx_params[6] = {
>> +   

Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

2017-04-28 Thread Nicolai Hähnle

On 24.04.2017 10:45, Marek Olšák wrote:

From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_shader.c | 87 +---
 1 file changed, 70 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index a4c2ac0..392f85d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct 
si_shader_context *ctx)
LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
/* No idea why LLVM aligns allocas to 4 elements. */
unsigned alignment = LLVMGetAlignment(inst);
unsigned dw_size = align(llvm_get_type_size(type) / 4, 
alignment);
ctx->shader->config.private_mem_vgprs += dw_size;
}
bb = LLVMGetNextBasicBlock(bb);
}
 }

+static void si_init_exec_full_mask(struct si_shader_context *ctx)
+{
+   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+   lp_build_intrinsic(ctx->gallivm.builder,
+  "llvm.amdgcn.init.exec", ctx->voidt,
+  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+}
+
 static void si_init_exec_from_input(struct si_shader_context *ctx,
unsigned param, unsigned bitoffset)
 {
LLVMValueRef args[] = {
LLVMGetParam(ctx->main_fn, param),
LLVMConstInt(ctx->i32, bitoffset, 0),
};
lp_build_intrinsic(ctx->gallivm.builder,
   "llvm.amdgcn.init.exec.from.input",
   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
@@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader 
*shader,
key->ps_epilog.states = shader->key.part.ps.epilog;
 }

 /**
  * Build the GS prolog function. Rotate the input vertices for triangle strips
  * with adjacency.
  */
 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
union si_shader_part_key *key)
 {
-   const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-   const unsigned num_vgprs = 8;
+   unsigned num_sgprs, num_vgprs;
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef params[32];
-   LLVMTypeRef returns[32];
+   LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
+   LLVMTypeRef returns[48];
LLVMValueRef func, ret;

+   if (ctx->screen->b.chip_class >= GFX9) {
+   num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+   num_vgprs = 5; /* ES inputs are not needed by GS */
+   } else {
+   num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+   num_vgprs = 8;
+   }
+
for (unsigned i = 0; i < num_sgprs; ++i) {
params[i] = ctx->i32;
returns[i] = ctx->i32;
}

for (unsigned i = 0; i < num_vgprs; ++i) {
params[num_sgprs + i] = ctx->i32;
returns[num_sgprs + i] = ctx->f32;
}

/* Create the function. */
si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
   params, num_sgprs + num_vgprs, num_sgprs - 1);
func = ctx->main_fn;

+   /* Set the full EXEC mask for the prolog, because we are only fiddling
+* with registers here. The main shader part will set the correct EXEC
+* mask.
+*/
+   if (ctx->screen->b.chip_class >= GFX9)
+   si_init_exec_full_mask(ctx);
+
/* Copy inputs to outputs. This should be no-op, as the registers match,
 * but it will prevent the compiler from overwriting them 
unintentionally.
 */
ret = ctx->return_value;
for (unsigned i = 0; i < num_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(builder, ret, p, i, "");
}
for (unsigned i = 0; i < num_vgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
p = LLVMBuildBitCast(builder, p, ctx->f32, "");
ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
}

if (key->gs_prolog.states.tri_strip_adj_fix) {
/* Remap the input vertices for every other primitive. */
-   const unsigned vtx_params[6] = {
+   const unsigned gfx6_vtx_params[6] = {
num_sgprs,
num_sgprs + 1,
num_sgprs + 3,
num_sgprs + 4,
num_sgprs + 5,
num_sgprs + 6
};
+   const unsigned gfx9_vtx_params[3] = {
+   num_sgprs,
+   num_sgp

[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS

2017-04-24 Thread Marek Olšák
From: Marek Olšák 

---
 src/gallium/drivers/radeonsi/si_shader.c | 87 +---
 1 file changed, 70 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index a4c2ac0..392f85d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct 
si_shader_context *ctx)
LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst));
/* No idea why LLVM aligns allocas to 4 elements. */
unsigned alignment = LLVMGetAlignment(inst);
unsigned dw_size = align(llvm_get_type_size(type) / 4, 
alignment);
ctx->shader->config.private_mem_vgprs += dw_size;
}
bb = LLVMGetNextBasicBlock(bb);
}
 }
 
+static void si_init_exec_full_mask(struct si_shader_context *ctx)
+{
+   LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+   lp_build_intrinsic(ctx->gallivm.builder,
+  "llvm.amdgcn.init.exec", ctx->voidt,
+  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+}
+
 static void si_init_exec_from_input(struct si_shader_context *ctx,
unsigned param, unsigned bitoffset)
 {
LLVMValueRef args[] = {
LLVMGetParam(ctx->main_fn, param),
LLVMConstInt(ctx->i32, bitoffset, 0),
};
lp_build_intrinsic(ctx->gallivm.builder,
   "llvm.amdgcn.init.exec.from.input",
   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
@@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader 
*shader,
key->ps_epilog.states = shader->key.part.ps.epilog;
 }
 
 /**
  * Build the GS prolog function. Rotate the input vertices for triangle strips
  * with adjacency.
  */
 static void si_build_gs_prolog_function(struct si_shader_context *ctx,
union si_shader_part_key *key)
 {
-   const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
-   const unsigned num_vgprs = 8;
+   unsigned num_sgprs, num_vgprs;
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef params[32];
-   LLVMTypeRef returns[32];
+   LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
+   LLVMTypeRef returns[48];
LLVMValueRef func, ret;
 
+   if (ctx->screen->b.chip_class >= GFX9) {
+   num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+   num_vgprs = 5; /* ES inputs are not needed by GS */
+   } else {
+   num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+   num_vgprs = 8;
+   }
+
for (unsigned i = 0; i < num_sgprs; ++i) {
params[i] = ctx->i32;
returns[i] = ctx->i32;
}
 
for (unsigned i = 0; i < num_vgprs; ++i) {
params[num_sgprs + i] = ctx->i32;
returns[num_sgprs + i] = ctx->f32;
}
 
/* Create the function. */
si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
   params, num_sgprs + num_vgprs, num_sgprs - 1);
func = ctx->main_fn;
 
+   /* Set the full EXEC mask for the prolog, because we are only fiddling
+* with registers here. The main shader part will set the correct EXEC
+* mask.
+*/
+   if (ctx->screen->b.chip_class >= GFX9)
+   si_init_exec_full_mask(ctx);
+
/* Copy inputs to outputs. This should be no-op, as the registers match,
 * but it will prevent the compiler from overwriting them 
unintentionally.
 */
ret = ctx->return_value;
for (unsigned i = 0; i < num_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(builder, ret, p, i, "");
}
for (unsigned i = 0; i < num_vgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
p = LLVMBuildBitCast(builder, p, ctx->f32, "");
ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
}
 
if (key->gs_prolog.states.tri_strip_adj_fix) {
/* Remap the input vertices for every other primitive. */
-   const unsigned vtx_params[6] = {
+   const unsigned gfx6_vtx_params[6] = {
num_sgprs,
num_sgprs + 1,
num_sgprs + 3,
num_sgprs + 4,
num_sgprs + 5,
num_sgprs + 6
};
+   const unsigned gfx9_vtx_params[3] = {
+   num_sgprs,
+   num_sgprs + 1,
+   num