From: Marek Olšák <marek.ol...@amd.com> Align constant buffer uploads to the smaller of the upload size rounded up to a power of two and the TCC cache line size, instead of always aligning them to 256 bytes. This results in a very tiny decrease in lgkm wait cycles. --- src/gallium/drivers/radeon/radeon_winsys.h | 1 + src/gallium/drivers/radeonsi/si_descriptors.c | 4 +++- src/gallium/drivers/radeonsi/si_pipe.h | 15 +++++++++++++++ src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 1 + src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 1 + 5 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 432550d..812c036 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -194,20 +194,21 @@ struct radeon_info { bool gfx_ib_pad_with_type2; bool has_sdma; bool has_uvd; uint32_t uvd_fw_version; uint32_t vce_fw_version; uint32_t me_fw_version; uint32_t pfp_fw_version; uint32_t ce_fw_version; uint32_t vce_harvest_config; uint32_t clock_crystal_freq; + uint32_t tcc_cache_line_size; /* Kernel info. */ uint32_t drm_major; /* version */ uint32_t drm_minor; uint32_t drm_patchlevel; bool has_userptr; /* Shader cores. */ uint32_t r600_max_quad_pipes; /* wave size / 16 */ uint32_t max_shader_clock; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 8f636af..72b33f3 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1040,21 +1040,23 @@ static struct si_descriptors * si_const_buffer_descriptors(struct si_context *sctx, unsigned shader) { return &sctx->descriptors[si_const_buffer_descriptors_idx(shader)]; } void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, const uint8_t *ptr, unsigned size, uint32_t *const_offset) { void *tmp; - u_upload_alloc(sctx->b.b.stream_uploader, 0, size, 256, const_offset, + u_upload_alloc(sctx->b.b.stream_uploader, 0, size, + si_optimal_tcc_alignment(sctx, size), + const_offset, (struct pipe_resource**)rbuffer, &tmp); if (*rbuffer) util_memcpy_cpu_to_le32(tmp, ptr, size); } static void si_set_constant_buffer(struct si_context *sctx, struct si_buffer_resources *buffers, unsigned descriptors_idx, uint slot, const struct pipe_constant_buffer *input) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index fb24bab..bee6881 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ 
b/src/gallium/drivers/radeonsi/si_pipe.h @@ -505,11 +505,26 @@ static inline struct si_shader* si_get_vs_state(struct si_context *sctx) static inline bool si_vs_exports_prim_id(struct si_shader *shader) { if (shader->selector->type == PIPE_SHADER_VERTEX) return shader->key.part.vs.epilog.export_prim_id; else if (shader->selector->type == PIPE_SHADER_TESS_EVAL) return shader->key.part.tes.epilog.export_prim_id; else return false; } +static inline unsigned +si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) +{ + unsigned alignment, tcc_cache_line_size; + + /* If the upload size is less than the cache line size (e.g. 16, 32), + * the whole thing will fit into a cache line if we align it to its size. + * The idea is that multiple small uploads can share a cache line. + * If the upload size is greater, align it to the cache line size. + */ + alignment = util_next_power_of_two(upload_size); + tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size; + return MIN2(alignment, tcc_cache_line_size); +} + #endif diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index db0087c..6511c48 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -338,20 +338,21 @@ static bool do_winsys_init(struct amdgpu_winsys *ws, int fd) ws->info.max_se = ws->amdinfo.num_shader_engines; ws->info.max_sh_per_se = ws->amdinfo.num_shader_arrays_per_engine; ws->info.has_uvd = uvd.available_rings != 0; ws->info.uvd_fw_version = uvd.available_rings ? uvd_version : 0; ws->info.vce_fw_version = vce.available_rings ? 
vce_version : 0; ws->info.has_userptr = true; ws->info.num_render_backends = ws->amdinfo.rb_pipes; ws->info.clock_crystal_freq = ws->amdinfo.gpu_counter_freq; + ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */ ws->info.num_tile_pipes = cik_get_num_tile_pipes(&ws->amdinfo); ws->info.pipe_interleave_bytes = 256 << ((ws->amdinfo.gb_addr_cfg >> 4) & 0x7); ws->info.has_virtual_memory = true; ws->info.has_sdma = dma.available_rings != 0; /* Get the number of good compute units. */ ws->info.num_good_compute_units = 0; for (i = 0; i < ws->info.max_se; i++) for (j = 0; j < ws->info.max_sh_per_se; j++) ws->info.num_good_compute_units += diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index bdcf194..7fde04e 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -534,20 +534,21 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) return false; } } /* Hawaii with old firmware needs type2 nop packet. * accel_working2 with value 3 indicates the new firmware. */ ws->info.gfx_ib_pad_with_type2 = ws->info.chip_class <= SI || (ws->info.family == CHIP_HAWAII && ws->accel_working2 < 3); + ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */ ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL; return true; } static void radeon_winsys_destroy(struct radeon_winsys *rws) { struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)rws; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev