On Mon, Sep 4, 2017 at 3:56 PM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 01.09.2017 02:57, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> This increases performance, but it was tuned for Raven, not Vega. >> We don't know yet how Vega will perform, hopefully not worse. >> --- >> src/gallium/drivers/radeon/r600_pipe_common.c | 2 + >> src/gallium/drivers/radeon/r600_pipe_common.h | 2 + >> src/gallium/drivers/radeonsi/Makefile.sources | 1 + >> src/gallium/drivers/radeonsi/si_hw_context.c | 2 + >> src/gallium/drivers/radeonsi/si_pipe.c | 5 + >> src/gallium/drivers/radeonsi/si_pipe.h | 2 + >> src/gallium/drivers/radeonsi/si_state.c | 26 +- >> src/gallium/drivers/radeonsi/si_state.h | 6 +- >> src/gallium/drivers/radeonsi/si_state_binning.c | 448 >> ++++++++++++++++++++++++ >> src/gallium/drivers/radeonsi/si_state_shaders.c | 2 + >> 10 files changed, 489 insertions(+), 7 deletions(-) >> create mode 100644 src/gallium/drivers/radeonsi/si_state_binning.c >> > [snip] > >> diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c >> b/src/gallium/drivers/radeonsi/si_state_binning.c >> new file mode 100644 >> index 0000000..56bcdc8 >> --- /dev/null >> +++ b/src/gallium/drivers/radeonsi/si_state_binning.c >> @@ -0,0 +1,448 @@ >> +/* >> + * Copyright 2017 Advanced Micro Devices, Inc. >> + * >> + * Permission is hereby granted, free of charge, to any person obtaining >> a >> + * copy of this software and associated documentation files (the >> "Software"), >> + * to deal in the Software without restriction, including without >> limitation >> + * on the rights to use, copy, modify, merge, publish, distribute, sub >> + * license, and/or sell copies of the Software, and to permit persons to >> whom >> + * the Software is furnished to do so, subject to the following >> conditions: >> + * >> + * The above copyright notice and this permission notice (including the >> next >> + * paragraph) shall be included in all copies or substantial portions of >> the >> + * Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, >> EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF >> MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT >> SHALL >> + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, >> + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR >> + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR >> THE >> + * USE OR OTHER DEALINGS IN THE SOFTWARE. >> + */ >> + >> +/* This file handles register programming of primitive binning. */ >> + >> +#include "si_pipe.h" >> +#include "sid.h" >> +#include "gfx9d.h" >> +#include "radeon/r600_cs.h" >> + >> +struct uvec2 { >> + unsigned x, y; >> +}; >> + >> +struct si_bin_size_map { >> + unsigned start; >> + unsigned bin_size_x; >> + unsigned bin_size_y; >> +}; >> + >> +typedef struct si_bin_size_map si_bin_size_subtable[3][9]; >> + >> +/* Find the bin size where sum is >= table[i].start and < table[i + >> 1].start. */ >> +static struct uvec2 si_find_bin_size(struct si_screen *sscreen, >> + const si_bin_size_subtable table[], >> + unsigned sum) >> +{ >> + unsigned log_num_rb_per_se = >> + util_logbase2_ceil(sscreen->b.info.num_render_backends / >> + sscreen->b.info.max_se); >> + unsigned log_num_se = util_logbase2_ceil(sscreen->b.info.max_se); >> + unsigned i; >> + >> + /* Get the chip-specific subtable. */ >> + const struct si_bin_size_map *subtable = >> + &table[log_num_rb_per_se][log_num_se][0]; >> + >> + for (i = 0; subtable[i].bin_size_x != 0; i++) { >> + if (sum >= subtable[i].start && sum < subtable[i + >> 1].start) >> + break; >> + } >> + >> + struct uvec2 size = {subtable[i].bin_size_x, >> subtable[i].bin_size_y}; >> + return size; >> +} >> + >> +static struct uvec2 si_get_color_bin_size(struct si_context *sctx, >> + unsigned cb_target_enabled_4bit) >> +{ >> + unsigned nr_samples = sctx->framebuffer.nr_samples; >> + unsigned sum = 0; >> + >> + /* Compute the sum of all Bpp. */ >> + for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) { >> + if (!(cb_target_enabled_4bit & (0xf << (i * 4)))) >> + continue; >> + >> + struct r600_texture *rtex = >> + (struct >> r600_texture*)sctx->framebuffer.state.cbufs[i]->texture; >> + sum += rtex->surface.bpe; >> + } > > > I believe this should early-out for !sum, for depth-only rendering.
See the table. There are different values for depth-only rendering depending on the number of SEs / RBs. I don't why the depth function has early-out but not the color function. > > >> + >> + /* Multiply the sum by some function of the number of samples. */ >> + if (nr_samples >= 2) { >> + if (sctx->ps_iter_samples >= 2) >> + sum *= nr_samples; >> + else >> + sum *= 2; >> + } >> + >> + static const si_bin_size_subtable table[] = { >> + { >> + /* One RB / SE */ >> + { >> + /* One shader engine */ >> + { 0, 128, 128 }, >> + { 1, 64, 128 }, >> + { 2, 32, 128 }, >> + { 3, 16, 128 }, >> + { 17, 0, 0 }, >> + { UINT_MAX, 0, 0 }, > > > All the UINT_MAX lines in this table should be unnecessary, given that > si_find_bin_size bails out when it sees bin_size_x == 0. Will fix. > > > >> + }, >> + { >> + /* Two shader engines */ >> + { 0, 128, 128 }, >> + { 2, 64, 128 }, >> + { 3, 32, 128 }, >> + { 5, 16, 128 }, >> + { 17, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + { >> + /* Four shader engines */ >> + { 0, 128, 128 }, >> + { 3, 64, 128 }, >> + { 5, 16, 128 }, >> + { 17, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + }, >> + { >> + /* Two RB / SE */ >> + { >> + /* One shader engine */ >> + { 0, 128, 128 }, >> + { 2, 64, 128 }, >> + { 3, 32, 128 }, >> + { 5, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + { >> + /* Two shader engines */ >> + { 0, 128, 128 }, >> + { 3, 64, 128 }, >> + { 5, 32, 128 }, >> + { 9, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + { >> + /* Four shader engines */ >> + { 0, 256, 256 }, >> + { 2, 128, 256 }, >> + { 3, 128, 128 }, >> + { 5, 64, 128 }, >> + { 9, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + }, >> + { >> + /* Four RB / SE */ >> + { >> + /* One shader engine */ >> + { 0, 128, 256 }, >> + { 2, 128, 128 }, >> + { 3, 64, 128 }, >> + { 5, 32, 128 }, >> + { 9, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + { >> + /* Two shader engines */ >> + { 0, 256, 256 }, >> + { 2, 128, 256 }, >> + { 3, 128, 128 }, >> + { 5, 64, 128 }, >> + { 9, 32, 128 }, >> + { 17, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + { >> + /* Four shader engines */ >> + { 0, 256, 512 }, >> + { 2, 256, 256 }, >> + { 3, 128, 256 }, >> + { 5, 128, 128 }, >> + { 9, 64, 128 }, >> + { 17, 16, 128 }, >> + { 33, 0, 0 }, >> + { UINT_MAX, 0, 0 }, >> + }, >> + }, >> + }; >> + >> + return si_find_bin_size(sctx->screen, table, sum); >> +} >> + >> +static struct uvec2 si_get_depth_bin_size(struct si_context *sctx) >> +{ >> + struct si_state_dsa *dsa = sctx->queued.named.dsa; >> + >> + if (!sctx->framebuffer.state.zsbuf || >> + (!dsa->depth_enabled && !dsa->stencil_enabled)) { >> + /* Return the max size. */ >> + struct uvec2 size = {512, 512}; >> + return size; >> + } >> + >> + struct r600_texture *rtex = >> + (struct >> r600_texture*)sctx->framebuffer.state.zsbuf->texture; >> + unsigned depth_coeff = dsa->depth_enabled ? 5 : 0; >> + unsigned stencil_coeff = rtex->surface.flags & RADEON_SURF_SBUFFER >> && >> + dsa->stencil_enabled ? 1 : 0; >> + unsigned sum = 4 * (depth_coeff + stencil_coeff) * >> + sctx->framebuffer.nr_samples; > > > sum will always be at least 4, which means that the first two entries in > each sub-table below will never be used. Can we remove them? (The tables > originate from internal docs, but I see you followed the Vulkan lead of > removing some entries already anyway, so...) I just copied the code from Vulkan. I'd like to keep it similar for easy porting. > > Also, do you understand where the 5 comes from? A reasonable guess is HTILE, > but that uses much less than 1/4 the memory of raw Z data. Not sure if it matters, but HTILE overhead isn't just the added memory size. I think it also adds latency, because HTILE adds a level of indirection. Is re-sending necessary? Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev