On Fri, Apr 28, 2017 at 1:54 PM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 24.04.2017 10:45, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> VGPR1 = InstanceID / StepRate0; // StepRate0 can be set to 1 >> --- >> src/gallium/drivers/radeonsi/si_shader.c | 20 ++++++++++++++------ >> src/gallium/drivers/radeonsi/si_shader.h | 1 + >> src/gallium/drivers/radeonsi/si_state.c | 1 + >> src/gallium/drivers/radeonsi/si_state_shaders.c | 24 >> +++++++++++++++++------- >> 4 files changed, 33 insertions(+), 13 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >> b/src/gallium/drivers/radeonsi/si_shader.c >> index edb50a3..ce509af 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.c >> +++ b/src/gallium/drivers/radeonsi/si_shader.c >> @@ -5838,23 +5838,28 @@ static void declare_vs_specific_input_sgprs(struct >> si_shader_context *ctx, >> params[ctx->param_vs_state_bits = (*num_params)++] = ctx->i32; >> } >> >> static void declare_vs_input_vgprs(struct si_shader_context *ctx, >> LLVMTypeRef *params, unsigned >> *num_params, >> unsigned *num_prolog_vgprs) >> { >> struct si_shader *shader = ctx->shader; >> >> params[ctx->param_vertex_id = (*num_params)++] = ctx->i32; >> - params[ctx->param_rel_auto_id = (*num_params)++] = ctx->i32; >> - params[ctx->param_vs_prim_id = (*num_params)++] = ctx->i32; >> - params[ctx->param_instance_id = (*num_params)++] = ctx->i32; >> + if (shader->key.as_ls) { >> + params[ctx->param_rel_auto_id = (*num_params)++] = >> ctx->i32; >> + params[ctx->param_instance_id = (*num_params)++] = >> ctx->i32; >> + } else { >> + params[ctx->param_instance_id = (*num_params)++] = >> ctx->i32; >> + params[ctx->param_vs_prim_id = (*num_params)++] = >> ctx->i32; >> + } >> + params[(*num_params)++] = ctx->i32; /* unused */ >> >> if (!shader->is_gs_copy_shader) { >> /* Vertex load indices. 
*/ >> ctx->param_vertex_index0 = (*num_params); >> for (unsigned i = 0; i < >> shader->selector->info.num_inputs; i++) >> params[(*num_params)++] = ctx->i32; >> *num_prolog_vgprs += shader->selector->info.num_inputs; >> } >> } >> >> @@ -7497,25 +7502,28 @@ static bool si_compile_tgsi_main(struct >> si_shader_context *ctx, >> static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, >> unsigned num_input_sgprs, >> const struct si_vs_prolog_bits >> *prolog_key, >> struct si_shader *shader_out, >> union si_shader_part_key *key) >> { >> memset(key, 0, sizeof(*key)); >> key->vs_prolog.states = *prolog_key; >> key->vs_prolog.num_input_sgprs = num_input_sgprs; >> key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; >> + key->vs_prolog.as_ls = shader_out->key.as_ls; >> >> - if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) >> + if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { >> + key->vs_prolog.as_ls = 1; >> key->vs_prolog.num_merged_next_stage_vgprs = 2; >> - else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) >> + } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { >> key->vs_prolog.num_merged_next_stage_vgprs = 5; >> + } >> >> /* Set the instanceID flag. */ >> for (unsigned i = 0; i < info->num_inputs; i++) >> if (key->vs_prolog.states.instance_divisors[i]) >> shader_out->info.uses_instanceid = true; >> } >> >> /** >> * Compute the VS epilog key, which contains all the information needed >> to >> * build the VS epilog function, and set the PrimitiveID output offset. 
>> @@ -8508,21 +8516,21 @@ static void si_build_vs_prolog_function(struct >> si_shader_context *ctx, >> LLVMValueRef ret, func; >> int last_sgpr, num_params, num_returns, i; >> unsigned first_vs_vgpr = key->vs_prolog.num_input_sgprs + >> >> key->vs_prolog.num_merged_next_stage_vgprs; >> unsigned num_input_vgprs = >> key->vs_prolog.num_merged_next_stage_vgprs + 4; >> unsigned num_all_input_regs = key->vs_prolog.num_input_sgprs + >> num_input_vgprs; >> unsigned user_sgpr_base = >> key->vs_prolog.num_merged_next_stage_vgprs ? 8 : 0; >> >> ctx->param_vertex_id = first_vs_vgpr; >> - ctx->param_instance_id = first_vs_vgpr + 3; >> + ctx->param_instance_id = first_vs_vgpr + (key->vs_prolog.as_ls ? 2 >> : 1); >> >> /* 4 preloaded VGPRs + vertex load indices as prolog outputs */ >> params = alloca(num_all_input_regs * sizeof(LLVMTypeRef)); >> returns = alloca((num_all_input_regs + key->vs_prolog.last_input + >> 1) * >> sizeof(LLVMTypeRef)); >> num_params = 0; >> num_returns = 0; >> >> /* Declare input and output SGPRs. */ >> num_params = 0; >> diff --git a/src/gallium/drivers/radeonsi/si_shader.h >> b/src/gallium/drivers/radeonsi/si_shader.h >> index 57685e0..6bca7f8 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.h >> +++ b/src/gallium/drivers/radeonsi/si_shader.h >> @@ -430,20 +430,21 @@ struct si_ps_epilog_bits { >> unsigned clamp_color:1; >> }; >> >> union si_shader_part_key { >> struct { >> struct si_vs_prolog_bits states; >> unsigned num_input_sgprs:6; >> /* For merged stages such as LS-HS, HS input VGPRs are >> first. */ >> unsigned num_merged_next_stage_vgprs:3; >> unsigned last_input:4; >> + unsigned as_ls:1; >> /* Prologs for monolithic shaders shouldn't set EXEC. 
*/ >> unsigned is_monolithic:1; >> } vs_prolog; >> struct { >> struct si_vs_epilog_bits states; >> unsigned prim_id_param_offset:5; >> } vs_epilog; >> struct { >> struct si_tcs_epilog_bits states; >> } tcs_epilog; >> diff --git a/src/gallium/drivers/radeonsi/si_state.c >> b/src/gallium/drivers/radeonsi/si_state.c >> index 39494cc..938e7fb 100644 >> --- a/src/gallium/drivers/radeonsi/si_state.c >> +++ b/src/gallium/drivers/radeonsi/si_state.c >> @@ -4334,20 +4334,21 @@ static void si_init_config(struct si_context >> *sctx) >> if (sctx->b.chip_class <= VI) { >> si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES); >> si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40); >> } >> si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2); >> >> si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0); >> si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); >> >> si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); >> + si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1); >> si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); >> if (sctx->b.chip_class < CIK) >> si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, >> S_008A14_NUM_CLIP_SEQ(3) | >> S_008A14_CLIP_VTX_REORDER_ENA(1)); >> >> si_pm4_set_reg(pm4, R_028BD4_PA_SC_CENTROID_PRIORITY_0, >> 0x76543210); >> si_pm4_set_reg(pm4, R_028BD8_PA_SC_CENTROID_PRIORITY_1, >> 0xfedcba98); >> >> si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0); >> >> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c >> b/src/gallium/drivers/radeonsi/si_state_shaders.c >> index 5bbc037..0c997e8 100644 >> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c >> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c >> @@ -450,22 +450,24 @@ static void si_shader_ls(struct si_screen *sscreen, >> struct si_shader *shader) >> assert(sscreen->b.chip_class <= VI); >> >> pm4 = si_get_shader_pm4_state(shader); >> if (!pm4) >> return; >> >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> 
RADEON_PRIO_SHADER_BINARY); >> >> /* We need at least 2 components for LS. >> - * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */ >> - vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1; >> + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / StepRate0, >> InstanceID). >> + * StepRate0 is set to 1. so that VGPR3 doesn't have to be loaded. >> + */ >> + vgpr_comp_cnt = shader->info.uses_instanceid ? 2 : 1; >> >> si_pm4_set_reg(pm4, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8); >> si_pm4_set_reg(pm4, R_00B524_SPI_SHADER_PGM_HI_LS, va >> 40); >> >> shader->config.rsrc1 = S_00B528_VGPRS((shader->config.num_vgprs - >> 1) / 4) | >> S_00B528_SGPRS((shader->config.num_sgprs - 1) / >> 8) | >> S_00B528_VGPR_COMP_CNT(vgpr_comp_cnt) | >> S_00B528_DX10_CLAMP(1) | >> S_00B528_FLOAT_MODE(shader->config.float_mode); >> shader->config.rsrc2 = S_00B52C_USER_SGPR(SI_VS_NUM_USER_SGPR) | >> @@ -483,22 +485,24 @@ static void si_shader_hs(struct si_screen *sscreen, >> struct si_shader *shader) >> return; >> >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> RADEON_PRIO_SHADER_BINARY); >> >> if (sscreen->b.chip_class >= GFX9) { >> si_pm4_set_reg(pm4, R_00B410_SPI_SHADER_PGM_LO_LS, va >> >> 8); >> si_pm4_set_reg(pm4, R_00B414_SPI_SHADER_PGM_HI_LS, va >> >> 40); >> >> /* We need at least 2 components for LS. >> - * VGPR0-3: (VertexID, RelAutoindex, ???, InstanceID). */ >> - ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 1; >> + * VGPR0-3: (VertexID, RelAutoindex, InstanceID / >> StepRate0, InstanceID). >> + * StepRate0 is set to 1. so that VGPR3 doesn't have to be >> loaded. >> + */ >> + ls_vgpr_comp_cnt = shader->info.uses_instanceid ? 
2 : 1; >> >> if (shader->config.scratch_bytes_per_wave) { >> fprintf(stderr, "HS: scratch buffer unsupported"); >> abort(); >> } >> >> shader->config.rsrc2 = >> S_00B42C_USER_SGPR(GFX9_TCS_NUM_USER_SGPR) | >> S_00B42C_USER_SGPR_MSB(GFX9_TCS_NUM_USER_SGPR >> >> 5) | >> >> S_00B42C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); >> @@ -536,21 +540,22 @@ static void si_shader_es(struct si_screen *sscreen, >> struct si_shader *shader) >> assert(sscreen->b.chip_class <= VI); >> >> pm4 = si_get_shader_pm4_state(shader); >> if (!pm4) >> return; >> >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> RADEON_PRIO_SHADER_BINARY); >> >> if (shader->selector->type == PIPE_SHADER_VERTEX) { >> - vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : 0; >> + /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ >> + vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0; >> num_user_sgprs = SI_VS_NUM_USER_SGPR; >> } else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) { >> vgpr_comp_cnt = shader->selector->info.uses_primid ? 3 : >> 2; >> num_user_sgprs = SI_TES_NUM_USER_SGPR; >> } else >> unreachable("invalid shader selector type"); >> >> oc_lds_en = shader->selector->type == PIPE_SHADER_TESS_EVAL ? 1 : >> 0; >> >> si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, >> @@ -751,21 +756,22 @@ static void si_shader_gs(struct si_screen *sscreen, >> struct si_shader *shader) >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> RADEON_PRIO_SHADER_BINARY); >> >> if (sscreen->b.chip_class >= GFX9) { >> unsigned input_prim = >> sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; >> unsigned es_type = shader->key.part.gs.es->type; >> unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; >> struct gfx9_gs_info gs_info; >> >> if (es_type == PIPE_SHADER_VERTEX) >> - es_vgpr_comp_cnt = shader->info.uses_instanceid ? >> 3 : 0; >> + /* VGPR0-3: (VertexID, InstanceID / StepRate0, >> ...) 
*/ >> + es_vgpr_comp_cnt = shader->info.uses_instanceid ? >> 1 : 0; >> else if (es_type == PIPE_SHADER_TESS_EVAL) >> es_vgpr_comp_cnt = >> shader->key.part.gs.es->info.uses_primid ? 3 : 2; >> else >> unreachable("invalid shader selector type"); >> >> /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored >> and >> * VGPR[0:4] are always loaded. >> */ >> if (sel->info.uses_invocationid) >> gs_vgpr_comp_cnt = 3; /* VGPR3 contains >> InvocationID. */ >> @@ -868,21 +874,25 @@ static void si_shader_vs(struct si_screen *sscreen, >> struct si_shader *shader, >> si_pm4_set_reg(pm4, R_028A84_VGT_PRIMITIVEID_EN, 0); >> } >> >> va = shader->bo->gpu_address; >> si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, >> RADEON_PRIO_SHADER_BINARY); >> >> if (gs) { >> vgpr_comp_cnt = 0; /* only VertexID is needed for GS-COPY. >> */ >> num_user_sgprs = SI_GSCOPY_NUM_USER_SGPR; >> } else if (shader->selector->type == PIPE_SHADER_VERTEX) { >> - vgpr_comp_cnt = shader->info.uses_instanceid ? 3 : >> (enable_prim_id ? 2 : 0); >> + /* VGPR0-3: (VertexID, InstanceID / StepRate0, PrimID, >> InstanceID) >> + * If PrimID is disabled. InstanceID / StepRate1 is loaded >> instead. > > > StepRate0.
It's really StepRate1: that comment refers to VGPR2, which is where PrimID is loaded. StepRate0 is applied to VGPR1. Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev