Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
On Fri, Apr 28, 2017 at 9:33 PM, Nicolai Hähnle wrote: > On 28.04.2017 17:59, Marek Olšák wrote: >> >> On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle >> wrote: >>> >>> On 24.04.2017 10:45, Marek Olšák wrote: From: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 87 +--- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a4c2ac0..392f85d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx) LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); /* No idea why LLVM aligns allocas to 4 elements. */ unsigned alignment = LLVMGetAlignment(inst); unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); ctx->shader->config.private_mem_vgprs += dw_size; } bb = LLVMGetNextBasicBlock(bb); } } +static void si_init_exec_full_mask(struct si_shader_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + lp_build_intrinsic(ctx->gallivm.builder, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); +} + static void si_init_exec_from_input(struct si_shader_context *ctx, unsigned param, unsigned bitoffset) { LLVMValueRef args[] = { LLVMGetParam(ctx->main_fn, param), LLVMConstInt(ctx->i32, bitoffset, 0), }; lp_build_intrinsic(ctx->gallivm.builder, "llvm.amdgcn.init.exec.from.input", ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader, key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ static void si_build_gs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - const unsigned num_vgprs = 8; + unsigned num_sgprs, num_vgprs; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef params[32]; - LLVMTypeRef returns[32]; + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ + LLVMTypeRef returns[48]; LLVMValueRef func, ret; + if (ctx->screen->b.chip_class >= GFX9) { + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + for (unsigned i = 0; i < num_sgprs; ++i) { params[i] = ctx->i32; returns[i] = ctx->i32; } for (unsigned i = 0; i < num_vgprs; ++i) { params[num_sgprs + i] = ctx->i32; returns[num_sgprs + i] = ctx->f32; } /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, params, num_sgprs + num_vgprs, num_sgprs - 1); func = ctx->main_fn; + /* Set the full EXEC mask for the prolog, because we are only fiddling +* with registers here. The main shader part will set the correct EXEC +* mask. +*/ + if (ctx->screen->b.chip_class >= GFX9) + si_init_exec_full_mask(ctx); + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; for (unsigned i = 0; i < num_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(builder, ret, p, i, ""); } for (unsigned i = 0; i < num_vgprs; i++) { LLVMValueRef p = LLVMGetPa
Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
On 28.04.2017 17:59, Marek Olšák wrote: On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle wrote: On 24.04.2017 10:45, Marek Olšák wrote: From: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 87 +--- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a4c2ac0..392f85d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx) LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); /* No idea why LLVM aligns allocas to 4 elements. */ unsigned alignment = LLVMGetAlignment(inst); unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); ctx->shader->config.private_mem_vgprs += dw_size; } bb = LLVMGetNextBasicBlock(bb); } } +static void si_init_exec_full_mask(struct si_shader_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + lp_build_intrinsic(ctx->gallivm.builder, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); +} + static void si_init_exec_from_input(struct si_shader_context *ctx, unsigned param, unsigned bitoffset) { LLVMValueRef args[] = { LLVMGetParam(ctx->main_fn, param), LLVMConstInt(ctx->i32, bitoffset, 0), }; lp_build_intrinsic(ctx->gallivm.builder, "llvm.amdgcn.init.exec.from.input", ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader, key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ static void si_build_gs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - const unsigned num_vgprs = 8; + unsigned num_sgprs, num_vgprs; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef params[32]; - LLVMTypeRef returns[32]; + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ + LLVMTypeRef returns[48]; LLVMValueRef func, ret; + if (ctx->screen->b.chip_class >= GFX9) { + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + for (unsigned i = 0; i < num_sgprs; ++i) { params[i] = ctx->i32; returns[i] = ctx->i32; } for (unsigned i = 0; i < num_vgprs; ++i) { params[num_sgprs + i] = ctx->i32; returns[num_sgprs + i] = ctx->f32; } /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, params, num_sgprs + num_vgprs, num_sgprs - 1); func = ctx->main_fn; + /* Set the full EXEC mask for the prolog, because we are only fiddling +* with registers here. The main shader part will set the correct EXEC +* mask. +*/ + if (ctx->screen->b.chip_class >= GFX9) + si_init_exec_full_mask(ctx); + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; for (unsigned i = 0; i < num_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(builder, ret, p, i, ""); } for (unsigned i = 0; i < num_vgprs; i++) { LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); p = LLVMBuildBitCast(builder, p, ctx->f32, ""); ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); } if (key->gs_prolog.states.tri_strip_adj_fix) { /* Remap the input vertices for every other primitive. */ - const unsigned vtx_params[6] = { + const unsigned gfx6_vtx_params[6] = { num_sgprs, num_sgprs + 1, num_sgprs + 3, num_sgprs + 4, num_sgprs + 5, num_sgprs + 6 }; + const unsi
Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle wrote: > On 24.04.2017 10:45, Marek Olšák wrote: >> >> From: Marek Olšák >> >> --- >> src/gallium/drivers/radeonsi/si_shader.c | 87 >> +--- >> 1 file changed, 70 insertions(+), 17 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >> b/src/gallium/drivers/radeonsi/si_shader.c >> index a4c2ac0..392f85d 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.c >> +++ b/src/gallium/drivers/radeonsi/si_shader.c >> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct >> si_shader_context *ctx) >> LLVMTypeRef type = >> LLVMGetElementType(LLVMTypeOf(inst)); >> /* No idea why LLVM aligns allocas to 4 elements. >> */ >> unsigned alignment = LLVMGetAlignment(inst); >> unsigned dw_size = align(llvm_get_type_size(type) >> / 4, alignment); >> ctx->shader->config.private_mem_vgprs += dw_size; >> } >> bb = LLVMGetNextBasicBlock(bb); >> } >> } >> >> +static void si_init_exec_full_mask(struct si_shader_context *ctx) >> +{ >> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); >> + lp_build_intrinsic(ctx->gallivm.builder, >> + "llvm.amdgcn.init.exec", ctx->voidt, >> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); >> +} >> + >> static void si_init_exec_from_input(struct si_shader_context *ctx, >> unsigned param, unsigned bitoffset) >> { >> LLVMValueRef args[] = { >> LLVMGetParam(ctx->main_fn, param), >> LLVMConstInt(ctx->i32, bitoffset, 0), >> }; >> lp_build_intrinsic(ctx->gallivm.builder, >>"llvm.amdgcn.init.exec.from.input", >>ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); >> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader >> *shader, >> key->ps_epilog.states = shader->key.part.ps.epilog; >> } >> >> /** >> * Build the GS prolog function. Rotate the input vertices for triangle >> strips >> * with adjacency. >> */ >> static void si_build_gs_prolog_function(struct si_shader_context *ctx, >> union si_shader_part_key *key) >> { >> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >> - const unsigned num_vgprs = 8; >> + unsigned num_sgprs, num_vgprs; >> struct gallivm_state *gallivm = &ctx->gallivm; >> LLVMBuilderRef builder = gallivm->builder; >> - LLVMTypeRef params[32]; >> - LLVMTypeRef returns[32]; >> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ >> + LLVMTypeRef returns[48]; >> LLVMValueRef func, ret; >> >> + if (ctx->screen->b.chip_class >= GFX9) { >> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; >> + num_vgprs = 5; /* ES inputs are not needed by GS */ >> + } else { >> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >> + num_vgprs = 8; >> + } >> + >> for (unsigned i = 0; i < num_sgprs; ++i) { >> params[i] = ctx->i32; >> returns[i] = ctx->i32; >> } >> >> for (unsigned i = 0; i < num_vgprs; ++i) { >> params[num_sgprs + i] = ctx->i32; >> returns[num_sgprs + i] = ctx->f32; >> } >> >> /* Create the function. */ >> si_create_function(ctx, "gs_prolog", returns, num_sgprs + >> num_vgprs, >>params, num_sgprs + num_vgprs, num_sgprs - 1); >> func = ctx->main_fn; >> >> + /* Set the full EXEC mask for the prolog, because we are only >> fiddling >> +* with registers here. The main shader part will set the correct >> EXEC >> +* mask. >> +*/ >> + if (ctx->screen->b.chip_class >= GFX9) >> + si_init_exec_full_mask(ctx); >> + >> /* Copy inputs to outputs. This should be no-op, as the registers >> match, >> * but it will prevent the compiler from overwriting them >> unintentionally. >> */ >> ret = ctx->return_value; >> for (unsigned i = 0; i < num_sgprs; i++) { >> LLVMValueRef p = LLVMGetParam(func, i); >> ret = LLVMBuildInsertValue(builder, ret, p, i, ""); >> } >> for (unsigned i = 0; i < num_vgprs; i++) { >> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); >> p = LLVMBuildBitCast(builder, p, ctx->f32, ""); >> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, >> ""); >> } >> >> if (key->gs_prolog.states.tri_strip_adj_fix) { >> /* Remap the input vertices for every other primitive. */ >> - const unsigned vtx_params[6] = { >> +
Re: [Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
On 24.04.2017 10:45, Marek Olšák wrote: From: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 87 +--- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a4c2ac0..392f85d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx) LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); /* No idea why LLVM aligns allocas to 4 elements. */ unsigned alignment = LLVMGetAlignment(inst); unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); ctx->shader->config.private_mem_vgprs += dw_size; } bb = LLVMGetNextBasicBlock(bb); } } +static void si_init_exec_full_mask(struct si_shader_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + lp_build_intrinsic(ctx->gallivm.builder, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); +} + static void si_init_exec_from_input(struct si_shader_context *ctx, unsigned param, unsigned bitoffset) { LLVMValueRef args[] = { LLVMGetParam(ctx->main_fn, param), LLVMConstInt(ctx->i32, bitoffset, 0), }; lp_build_intrinsic(ctx->gallivm.builder, "llvm.amdgcn.init.exec.from.input", ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader, key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ static void si_build_gs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - const unsigned num_vgprs = 8; + unsigned num_sgprs, num_vgprs; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef params[32]; - LLVMTypeRef returns[32]; + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ + LLVMTypeRef returns[48]; LLVMValueRef func, ret; + if (ctx->screen->b.chip_class >= GFX9) { + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + for (unsigned i = 0; i < num_sgprs; ++i) { params[i] = ctx->i32; returns[i] = ctx->i32; } for (unsigned i = 0; i < num_vgprs; ++i) { params[num_sgprs + i] = ctx->i32; returns[num_sgprs + i] = ctx->f32; } /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, params, num_sgprs + num_vgprs, num_sgprs - 1); func = ctx->main_fn; + /* Set the full EXEC mask for the prolog, because we are only fiddling +* with registers here. The main shader part will set the correct EXEC +* mask. +*/ + if (ctx->screen->b.chip_class >= GFX9) + si_init_exec_full_mask(ctx); + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; for (unsigned i = 0; i < num_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(builder, ret, p, i, ""); } for (unsigned i = 0; i < num_vgprs; i++) { LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); p = LLVMBuildBitCast(builder, p, ctx->f32, ""); ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); } if (key->gs_prolog.states.tri_strip_adj_fix) { /* Remap the input vertices for every other primitive. */ - const unsigned vtx_params[6] = { + const unsigned gfx6_vtx_params[6] = { num_sgprs, num_sgprs + 1, num_sgprs + 3, num_sgprs + 4, num_sgprs + 5, num_sgprs + 6 }; + const unsigned gfx9_vtx_params[3] = { + num_sgprs, + num_sgp
[Mesa-dev] [PATCH 43/61] radeonsi/gfx9: add GS prolog support for merged ES-GS
From: Marek Olšák --- src/gallium/drivers/radeonsi/si_shader.c | 87 +--- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a4c2ac0..392f85d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct si_shader_context *ctx) LLVMTypeRef type = LLVMGetElementType(LLVMTypeOf(inst)); /* No idea why LLVM aligns allocas to 4 elements. */ unsigned alignment = LLVMGetAlignment(inst); unsigned dw_size = align(llvm_get_type_size(type) / 4, alignment); ctx->shader->config.private_mem_vgprs += dw_size; } bb = LLVMGetNextBasicBlock(bb); } } +static void si_init_exec_full_mask(struct si_shader_context *ctx) +{ + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); + lp_build_intrinsic(ctx->gallivm.builder, + "llvm.amdgcn.init.exec", ctx->voidt, + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); +} + static void si_init_exec_from_input(struct si_shader_context *ctx, unsigned param, unsigned bitoffset) { LLVMValueRef args[] = { LLVMGetParam(ctx->main_fn, param), LLVMConstInt(ctx->i32, bitoffset, 0), }; lp_build_intrinsic(ctx->gallivm.builder, "llvm.amdgcn.init.exec.from.input", ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader *shader, key->ps_epilog.states = shader->key.part.ps.epilog; } /** * Build the GS prolog function. Rotate the input vertices for triangle strips * with adjacency. */ static void si_build_gs_prolog_function(struct si_shader_context *ctx, union si_shader_part_key *key) { - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; - const unsigned num_vgprs = 8; + unsigned num_sgprs, num_vgprs; struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef params[32]; - LLVMTypeRef returns[32]; + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ + LLVMTypeRef returns[48]; LLVMValueRef func, ret; + if (ctx->screen->b.chip_class >= GFX9) { + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; + num_vgprs = 5; /* ES inputs are not needed by GS */ + } else { + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; + num_vgprs = 8; + } + for (unsigned i = 0; i < num_sgprs; ++i) { params[i] = ctx->i32; returns[i] = ctx->i32; } for (unsigned i = 0; i < num_vgprs; ++i) { params[num_sgprs + i] = ctx->i32; returns[num_sgprs + i] = ctx->f32; } /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, params, num_sgprs + num_vgprs, num_sgprs - 1); func = ctx->main_fn; + /* Set the full EXEC mask for the prolog, because we are only fiddling +* with registers here. The main shader part will set the correct EXEC +* mask. +*/ + if (ctx->screen->b.chip_class >= GFX9) + si_init_exec_full_mask(ctx); + /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ ret = ctx->return_value; for (unsigned i = 0; i < num_sgprs; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(builder, ret, p, i, ""); } for (unsigned i = 0; i < num_vgprs; i++) { LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); p = LLVMBuildBitCast(builder, p, ctx->f32, ""); ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, ""); } if (key->gs_prolog.states.tri_strip_adj_fix) { /* Remap the input vertices for every other primitive. */ - const unsigned vtx_params[6] = { + const unsigned gfx6_vtx_params[6] = { num_sgprs, num_sgprs + 1, num_sgprs + 3, num_sgprs + 4, num_sgprs + 5, num_sgprs + 6 }; + const unsigned gfx9_vtx_params[3] = { + num_sgprs, + num_sgprs + 1, + num