From: Marek Olšák <marek.ol...@amd.com>
---
src/gallium/drivers/radeonsi/si_shader.c | 87 +++++++++++++++++++++++++-------
1 file changed, 70 insertions(+), 17 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a4c2ac0..392f85d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx)
LLVMTypeRef type =
LLVMGetElementType(LLVMTypeOf(inst));
/* No idea why LLVM aligns allocas to 4 elements.
*/
unsigned alignment = LLVMGetAlignment(inst);
unsigned dw_size = align(llvm_get_type_size(type)
/ 4, alignment);
ctx->shader->config.private_mem_vgprs += dw_size;
}
bb = LLVMGetNextBasicBlock(bb);
}
}
+static void si_init_exec_full_mask(struct si_shader_context *ctx)
+{ /* Set the full EXEC mask by emitting a call to llvm.amdgcn.init.exec. */
+ LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); /* ctx->i64 constant with every bit set */
+ lp_build_intrinsic(ctx->gallivm.builder,
+ "llvm.amdgcn.init.exec", ctx->voidt,
+ &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); /* full_mask is the only argument; call is marked convergent */
+}
+
static void si_init_exec_from_input(struct si_shader_context *ctx,
unsigned param, unsigned bitoffset)
{
LLVMValueRef args[] = {
LLVMGetParam(ctx->main_fn, param),
LLVMConstInt(ctx->i32, bitoffset, 0),
};
lp_build_intrinsic(ctx->gallivm.builder,
"llvm.amdgcn.init.exec.from.input",
ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
@@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
key->ps_epilog.states = shader->key.part.ps.epilog;
}
/**
* Build the GS prolog function. Rotate the input vertices for triangle strips
* with adjacency.
*/
static void si_build_gs_prolog_function(struct si_shader_context *ctx,
union si_shader_part_key *key)
{
- const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
- const unsigned num_vgprs = 8;
+ unsigned num_sgprs, num_vgprs;
struct gallivm_state *gallivm = &ctx->gallivm;
LLVMBuilderRef builder = gallivm->builder;
- LLVMTypeRef params[32];
- LLVMTypeRef returns[32];
+ LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */
+ LLVMTypeRef returns[48];
LLVMValueRef func, ret;
+ if (ctx->screen->b.chip_class >= GFX9) {
+ num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR;
+ num_vgprs = 5; /* ES inputs are not needed by GS */
+ } else {
+ num_sgprs = GFX6_GS_NUM_USER_SGPR + 2;
+ num_vgprs = 8;
+ }
+
for (unsigned i = 0; i < num_sgprs; ++i) {
params[i] = ctx->i32;
returns[i] = ctx->i32;
}
for (unsigned i = 0; i < num_vgprs; ++i) {
params[num_sgprs + i] = ctx->i32;
returns[num_sgprs + i] = ctx->f32;
}
/* Create the function. */
si_create_function(ctx, "gs_prolog", returns, num_sgprs +
num_vgprs,
params, num_sgprs + num_vgprs, num_sgprs - 1);
func = ctx->main_fn;
+ /* Set the full EXEC mask for the prolog, because we are only
fiddling
+ * with registers here. The main shader part will set the correct
EXEC
+ * mask.
+ */
+ if (ctx->screen->b.chip_class >= GFX9)
+ si_init_exec_full_mask(ctx);
+
/* Copy inputs to outputs. This should be no-op, as the registers
match,
* but it will prevent the compiler from overwriting them
unintentionally.
*/
ret = ctx->return_value;
for (unsigned i = 0; i < num_sgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, i);
ret = LLVMBuildInsertValue(builder, ret, p, i, "");
}
for (unsigned i = 0; i < num_vgprs; i++) {
LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
p = LLVMBuildBitCast(builder, p, ctx->f32, "");
ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i,
"");
}
if (key->gs_prolog.states.tri_strip_adj_fix) {
/* Remap the input vertices for every other primitive. */
- const unsigned vtx_params[6] = {
+ const unsigned gfx6_vtx_params[6] = {
num_sgprs,
num_sgprs + 1,
num_sgprs + 3,
num_sgprs + 4,
num_sgprs + 5,
num_sgprs + 6
};
+ const unsigned gfx9_vtx_params[3] = {
+ num_sgprs,
+ num_sgprs + 1,
+ num_sgprs + 4,
+ };
+ LLVMValueRef vtx_in[6], vtx_out[6];
LLVMValueRef prim_id, rotate;
+ if (ctx->screen->b.chip_class >= GFX9) {
+ for (unsigned i = 0; i < 3; i++) {
+ vtx_in[i*2] = unpack_param(ctx,
gfx9_vtx_params[i], 0, 16);
+ vtx_in[i*2+1] = unpack_param(ctx,
gfx9_vtx_params[i], 16, 16);
+ }
+ } else {
+ for (unsigned i = 0; i < 6; i++)
+ vtx_in[i] = LLVMGetParam(func,
gfx6_vtx_params[i]);
+ }
+
prim_id = LLVMGetParam(func, num_sgprs + 2);
rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
for (unsigned i = 0; i < 6; ++i) {
- LLVMValueRef base, rotated, actual;
- base = LLVMGetParam(func, vtx_params[i]);
- rotated = LLVMGetParam(func, vtx_params[(i + 4) %
6]);
- actual = LLVMBuildSelect(builder, rotate, rotated,
base, "");
- actual = LLVMBuildBitCast(builder, actual,
ctx->f32, "");
- ret = LLVMBuildInsertValue(builder, ret, actual,
vtx_params[i], "");
+ LLVMValueRef base, rotated;
+ base = vtx_in[i];
+ rotated = vtx_in[(i + 4) % 6];
+ vtx_out[i] = LLVMBuildSelect(builder, rotate,
rotated, base, "");
+ }
+
+ if (ctx->screen->b.chip_class >= GFX9) {
+ for (unsigned i = 0; i < 3; i++) {
+ LLVMValueRef hi, out;
+
+ hi = LLVMBuildShl(builder, vtx_out[i*2+1],
+ LLVMConstInt(ctx->i32,
16, 0), "");
+ out = LLVMBuildOr(builder, vtx_out[i*2],
hi, "");
+ out = LLVMBuildBitCast(builder, out,
ctx->f32, "");
+ ret = LLVMBuildInsertValue(builder, ret,
out,
+
gfx9_vtx_params[i], "");
+ }
+ } else {
+ for (unsigned i = 0; i < 6; i++) {
+ LLVMValueRef out;
+
+ out = LLVMBuildBitCast(builder,
vtx_out[i], ctx->f32, "");
+ ret = LLVMBuildInsertValue(builder, ret,
out,
+
gfx6_vtx_params[i], "");
+ }
}