From: Marek Olšák <marek.ol...@amd.com> Shader key size: 107 -> 47 bytes
Divisors of 0 and 1 are encoded in the shader key. Greater instance divisors are loaded from a constant buffer. The shader code doing the division is huge. Is it something we need to worry about? Does any app use instance divisors >= 2? VS prolog disassembly: s_load_dwordx4 s[12:15], s[0:1], 0x80 ; C00A0300 00000080 s_nop 0 ; BF800000 s_waitcnt lgkmcnt(0) ; BF8C007F s_buffer_load_dword s14, s[12:15], 0x4 ; C0220386 00000004 s_waitcnt lgkmcnt(0) ; BF8C007F v_cvt_f32_u32_e32 v4, s14 ; 7E080C0E v_rcp_iflag_f32_e32 v4, v4 ; 7E084704 v_mul_f32_e32 v4, 0x4f800000, v4 ; 0A0808FF 4F800000 v_cvt_u32_f32_e32 v4, v4 ; 7E080F04 v_mul_hi_u32 v5, v4, s14 ; D2860005 00001D04 v_mul_lo_i32 v6, v4, s14 ; D2850006 00001D04 v_cmp_eq_u32_e64 s[12:13], 0, v5 ; D0CA000C 00020A80 v_sub_i32_e32 v5, vcc, 0, v6 ; 340A0C80 v_cndmask_b32_e64 v5, v6, v5, s[12:13] ; D1000005 00320B06 v_mul_hi_u32 v5, v5, v4 ; D2860005 00020905 v_add_i32_e32 v6, vcc, v5, v4 ; 320C0905 v_subrev_i32_e32 v4, vcc, v5, v4 ; 36080905 v_cndmask_b32_e64 v4, v4, v6, s[12:13] ; D1000004 00320D04 v_mul_hi_u32 v5, v4, v1 ; D2860005 00020304 v_add_i32_e32 v4, vcc, s8, v0 ; 32080008 v_mul_lo_i32 v6, v5, s14 ; D2850006 00001D05 v_add_i32_e32 v7, vcc, 1, v5 ; 320E0A81 v_cmp_ge_u32_e64 s[12:13], v1, v6 ; D0CE000C 00020D01 v_sub_i32_e32 v6, vcc, v1, v6 ; 340C0D01 v_cmp_le_u32_e32 vcc, s14, v6 ; 7D960C0E v_cndmask_b32_e64 v8, 0, -1, s[12:13] ; D1000008 00318280 v_cndmask_b32_e64 v6, 0, -1, vcc ; D1000006 01A98280 v_and_b32_e32 v6, v8, v6 ; 260C0D08 v_cmp_eq_u32_e32 vcc, 0, v6 ; 7D940C80 v_cndmask_b32_e32 v6, v7, v5, vcc ; 000C0B07 v_add_i32_e32 v5, vcc, -1, v5 ; 320A0AC1 v_cmp_eq_u32_e32 vcc, 0, v8 ; 7D941080 v_cndmask_b32_e32 v5, v6, v5, vcc ; 000A0B06 v_add_i32_e32 v5, vcc, s9, v5 ; 320A0A09 --- src/gallium/drivers/radeonsi/si_descriptors.c | 2 + src/gallium/drivers/radeonsi/si_pipe.c | 2 + src/gallium/drivers/radeonsi/si_shader.c | 78 +++++++++++++++++-------- src/gallium/drivers/radeonsi/si_shader.h | 9 ++- 
src/gallium/drivers/radeonsi/si_state.c | 15 +++++ src/gallium/drivers/radeonsi/si_state.h | 3 + src/gallium/drivers/radeonsi/si_state_shaders.c | 7 ++- 7 files changed, 88 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 75d2a1d..88f7dce 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -2185,20 +2185,22 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx, R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS); si_emit_shader_pointer(sctx, descs, R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS); } else { si_emit_shader_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0); si_emit_shader_pointer(sctx, descs, R_00B330_SPI_SHADER_USER_DATA_ES_0); si_emit_shader_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0); + si_emit_shader_pointer(sctx, descs, + R_00B530_SPI_SHADER_USER_DATA_LS_0); } } mask = sctx->shader_pointers_dirty & u_bit_consecutive(SI_DESCS_FIRST_SHADER, SI_DESCS_FIRST_COMPUTE - SI_DESCS_FIRST_SHADER); while (mask) { unsigned i = u_bit_scan(&mask); unsigned shader = (i - SI_DESCS_FIRST_SHADER) / SI_NUM_SHADER_DESCS; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 4088849..a940bb8 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -301,20 +301,22 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, for (shader = 0; shader < SI_NUM_SHADERS; shader++) { for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { sctx->b.b.set_constant_buffer(&sctx->b.b, shader, i, &sctx->null_const_buf); } } si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf); + si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, + &sctx->null_const_buf); si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf); si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, 
&sctx->null_const_buf); si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf); /* Clear the NULL constant buffer, because loads should return zeros. */ sctx->b.clear_buffer(&sctx->b.b, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, 0, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 42b08bf..55d1232 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -305,31 +305,30 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx) LLVMValueRef rel_patch_id = get_rel_patch_id(ctx); return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset, LLVMBuildMul(gallivm->builder, patch_stride, rel_patch_id, ""), ""); } static LLVMValueRef get_instance_index_for_fetch( struct si_shader_context *ctx, - unsigned param_start_instance, unsigned divisor) + unsigned param_start_instance, LLVMValueRef divisor) { struct gallivm_state *gallivm = &ctx->gallivm; LLVMValueRef result = LLVMGetParam(ctx->main_fn, ctx->param_instance_id); /* The division must be done before START_INSTANCE is added. */ - if (divisor > 1) - result = LLVMBuildUDiv(gallivm->builder, result, - LLVMConstInt(ctx->i32, divisor, 0), ""); + if (divisor != ctx->i32_1) + result = LLVMBuildUDiv(gallivm->builder, result, divisor, ""); return LLVMBuildAdd(gallivm->builder, result, LLVMGetParam(ctx->main_fn, param_start_instance), ""); } /* Bitcast <4 x float> to <2 x double>, extract the component, and convert * to float. 
*/ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, LLVMValueRef vec4, unsigned double_index) @@ -5275,26 +5274,24 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, FREE(shader); shader = NULL; } return shader; } static void si_dump_shader_key_vs(const struct si_shader_key *key, const struct si_vs_prolog_bits *prolog, const char *prefix, FILE *f) { - fprintf(f, " %s.instance_divisors = {", prefix); - for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) { - fprintf(f, !i ? "%u" : ", %u", - prolog->instance_divisors[i]); - } - fprintf(f, "}\n"); + fprintf(f, " %s.instance_divisor_is_one = %u\n", + prefix, prolog->instance_divisor_is_one); + fprintf(f, " %s.instance_divisor_is_fetched = %u\n", + prefix, prolog->instance_divisor_is_fetched); fprintf(f, " mono.vs.fix_fetch = {"); for (int i = 0; i < SI_MAX_ATTRIBS; i++) fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]); fprintf(f, "}\n"); } static void si_dump_shader_key(unsigned processor, const struct si_shader *shader, FILE *f) { @@ -5596,24 +5593,26 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info, key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1; key->vs_prolog.as_ls = shader_out->key.as_ls; if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) { key->vs_prolog.as_ls = 1; key->vs_prolog.num_merged_next_stage_vgprs = 2; } else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) { key->vs_prolog.num_merged_next_stage_vgprs = 5; } - /* Set the instanceID flag. */ - for (unsigned i = 0; i < info->num_inputs; i++) - if (key->vs_prolog.states.instance_divisors[i]) - shader_out->info.uses_instanceid = true; + /* Enable loading the InstanceID VGPR. 
*/ + uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); + + if ((key->vs_prolog.states.instance_divisor_is_one | + key->vs_prolog.states.instance_divisor_is_fetched) & input_mask) + shader_out->info.uses_instanceid = true; } /** * Compute the PS prolog key, which contains all the information needed to * build the PS prolog function, and set related bits in shader->config. */ static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, bool separate_prolog) { @@ -6520,20 +6519,35 @@ si_get_shader_part(struct si_screen *sscreen, result->next = *list; *list = result; out: si_llvm_dispose(&ctx); mtx_unlock(&sscreen->shader_parts_mutex); return result; } +static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = &ctx->gallivm; + LLVMValueRef ptr[2], list; + + /* Get the pointer to rw buffers. */ + ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS); + ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI); + list = lp_build_gather_values(gallivm, ptr, 2); + list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, ""); + list = LLVMBuildIntToPtr(gallivm->builder, list, + si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), ""); + return list; +} + /** * Build the vertex shader prolog function. * * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values). * All inputs are returned unmodified. The vertex load indices are * stored after them, which will be used by the API VS for fetching inputs. 
* * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are: * input_v0, * input_v1, @@ -6602,25 +6616,47 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, LLVMValueRef p = LLVMGetParam(func, i); ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); } for (; i < num_params; i++) { LLVMValueRef p = LLVMGetParam(func, i); p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, ""); ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); } /* Compute vertex load indices from instance divisors. */ + LLVMValueRef instance_divisor_constbuf = NULL; + + if (key->vs_prolog.states.instance_divisor_is_fetched) { + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); + LLVMValueRef buf_index = + LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0); + instance_divisor_constbuf = + ac_build_indexed_load_const(&ctx->ac, list, buf_index); + } + for (i = 0; i <= key->vs_prolog.last_input; i++) { - unsigned divisor = key->vs_prolog.states.instance_divisors[i]; + bool divisor_is_one = + key->vs_prolog.states.instance_divisor_is_one & (1u << i); + bool divisor_is_fetched = + key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); LLVMValueRef index; - if (divisor) { + if (divisor_is_one || divisor_is_fetched) { + LLVMValueRef divisor = ctx->i32_1; + + if (divisor_is_fetched) { + divisor = buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->i32, i * 4, 0)); + divisor = LLVMBuildBitCast(gallivm->builder, divisor, + ctx->i32, ""); + } + /* InstanceID / Divisor + StartInstance */ index = get_instance_index_for_fetch(ctx, user_sgpr_base + SI_SGPR_START_INSTANCE, divisor); } else { /* VertexID + BaseVertex */ index = LLVMBuildAdd(gallivm->builder, LLVMGetParam(func, ctx->param_vertex_id), LLVMGetParam(func, user_sgpr_base + @@ -6859,29 +6895,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, for (i = 0; i < num_params; i++) { LLVMValueRef p = LLVMGetParam(func, i); ret = 
LLVMBuildInsertValue(gallivm->builder, ret, p, i, ""); } /* Polygon stippling. */ if (key->ps_prolog.states.poly_stipple) { /* POS_FIXED_PT is always last. */ unsigned pos = key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs - 1; - LLVMValueRef ptr[2], list; - - /* Get the pointer to rw buffers. */ - ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS); - ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI); - list = lp_build_gather_values(gallivm, ptr, 2); - list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, ""); - list = LLVMBuildIntToPtr(gallivm->builder, list, - si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), ""); + LLVMValueRef list = si_prolog_get_rw_buffers(ctx); si_llvm_emit_polygon_stipple(ctx, list, pos); } if (key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear) { unsigned i, base = key->ps_prolog.num_input_sgprs; LLVMValueRef center[2], centroid[2], tmp, bc_optimize; /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 6432126..a10067d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -378,21 +378,28 @@ struct si_shader_selector { * -> = merged with the next stage */ /* Use the byte alignment for all following structure members for optimal * shader key memory footprint. */ #pragma pack(push, 1) /* Common VS bits between the shader key and the prolog key. */ struct si_vs_prolog_bits { - unsigned instance_divisors[SI_MAX_ATTRIBS]; + /* - If neither "is_one" nor "is_fetched" has a bit set, the instance + * divisor is 0. + * - If "is_one" has a bit set, the instance divisor is 1. + * - If "is_fetched" has a bit set, the instance divisor will be loaded + * from the constant buffer. 
+ */ + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ }; /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { unsigned prim_mode:3; unsigned tes_reads_tess_factors:1; }; struct si_gs_prolog_bits { unsigned tri_strip_adj_fix:1; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index a674a60..7e3d1a0 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -3766,20 +3766,25 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned char swizzle[4]; if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { FREE(v); return NULL; } if (elements[i].instance_divisor) { v->uses_instance_divisors = true; v->instance_divisors[i] = elements[i].instance_divisor; + + if (v->instance_divisors[i] == 1) + v->instance_divisor_is_one |= 1u << i; + else + v->instance_divisor_is_fetched |= 1u << i; } if (!used[vbo_index]) { v->first_vb_use_mask |= 1 << i; used[vbo_index] = true; } desc = util_format_description(elements[i].src_format); first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); @@ -3894,20 +3899,30 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) sctx->vertex_elements = v; sctx->vertex_buffers_dirty = true; if (v && (!old || old->count != v->count || old->uses_instance_divisors != v->uses_instance_divisors || v->uses_instance_divisors || /* we don't check which divisors changed */ memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) sctx->do_update_shaders = true; + + if (v && v->instance_divisor_is_fetched) { + struct pipe_constant_buffer cb; + + cb.buffer = NULL; + cb.user_buffer = v->instance_divisors; + cb.buffer_offset = 0; + cb.buffer_size = sizeof(uint32_t) * v->count; + si_set_rw_buffer(sctx, 
SI_VS_CONST_INSTANCE_DIVISORS, &cb); + } } static void si_delete_vertex_element(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; if (sctx->vertex_elements == state) sctx->vertex_elements = NULL; FREE(state); } diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index c9e0770..ec28aba 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -108,20 +108,22 @@ struct si_vertex_elements uint8_t fix_fetch[SI_MAX_ATTRIBS]; uint8_t format_size[SI_MAX_ATTRIBS]; uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; uint8_t count; bool uses_instance_divisors; uint16_t first_vb_use_mask; /* Vertex buffer descriptor list size aligned for optimal prefetch. */ uint16_t desc_list_byte_size; + uint16_t instance_divisor_is_one; /* bitmask of inputs */ + uint16_t instance_divisor_is_fetched; /* bitmask of inputs */ }; union si_state { struct { struct si_state_blend *blend; struct si_state_rasterizer *rasterizer; struct si_state_dsa *dsa; struct si_pm4_state *poly_offset; struct si_pm4_state *ls; struct si_pm4_state *hs; @@ -175,20 +177,21 @@ enum { SI_GS_RING_ESGS, SI_RING_GSVS, SI_VS_STREAMOUT_BUF0, SI_VS_STREAMOUT_BUF1, SI_VS_STREAMOUT_BUF2, SI_VS_STREAMOUT_BUF3, SI_HS_CONST_DEFAULT_TESS_LEVELS, + SI_VS_CONST_INSTANCE_DIVISORS, SI_VS_CONST_CLIP_PLANES, SI_PS_CONST_POLY_STIPPLE, SI_PS_CONST_SAMPLE_POSITIONS, SI_NUM_RW_BUFFERS, }; /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines * are contiguous: * diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4eb3b75..63cc746 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1180,24 +1180,27 @@ static unsigned si_get_alpha_test_func(struct si_context *sctx) } static void si_shader_selector_key_vs(struct si_context *sctx, struct si_shader_selector *vs, struct 
si_shader_key *key, struct si_vs_prolog_bits *prolog_key) { if (!sctx->vertex_elements) return; + prolog_key->instance_divisor_is_one = + sctx->vertex_elements->instance_divisor_is_one; + prolog_key->instance_divisor_is_fetched = + sctx->vertex_elements->instance_divisor_is_fetched; + unsigned count = MIN2(vs->info.num_inputs, sctx->vertex_elements->count); - memcpy(prolog_key->instance_divisors, - sctx->vertex_elements->instance_divisors, count * 4); memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count); } static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, struct si_shader_key *key) { struct si_shader_selector *ps = sctx->ps_shader.cso; key->opt.clip_disable = -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev