On Wed, Apr 3, 2013 at 10:25 AM, Eric Anholt <[email protected]> wrote:
> The way we were allocating registers before, packing into low register
> numbers for Ironlake, resulted in an overly-constrained dependency graph
> for instruction scheduling.  Improves GLBenchmark 2.1 performance by
> 3.4% +/- 0.6% (n=26)
> ---
>  src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp |  2 ++
>  src/mesa/program/register_allocate.c              | 31 +++++++++++++++++++--
>  src/mesa/program/register_allocate.h              |  1 +
>  3 files changed, 31 insertions(+), 3 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 4ee7bbc..b9b0303 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -108,6 +108,8 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
>
>     uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
>     struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
> +   if (intel->gen >= 6)
> +      ra_set_allocate_round_robin(regs);
>     int *classes = ralloc_array(brw, int, class_count);
>     int aligned_pairs_class = -1;
>
> diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c
> index a9064c3..5f45662 100644
> --- a/src/mesa/program/register_allocate.c
> +++ b/src/mesa/program/register_allocate.c
> @@ -70,6 +70,7 @@
>   * this during ra_set_finalize().
>   */
>
> +#include <stdbool.h>
>  #include <ralloc.h>
>
>  #include "main/imports.h"
> @@ -93,6 +94,8 @@ struct ra_regs {
>
>     struct ra_class **classes;
>     unsigned int class_count;
> +
> +   bool round_robin;
>  };
>
>  struct ra_class {
> @@ -185,6 +188,22 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count)
>     return regs;
>  }
>
> +/**
> + * The register allocator by default prefers to allocate low register numbers,
> + * since it was written for hardware (gen4/5 Intel) that is limited in its
> + * multithreadedness by the number of registers used in a given shader.
> + *
> + * However, for hardware without that restriction, densely packed register
> + * allocation can put serious constraints on instruction scheduling.  This
> + * function tells the allocator to rotate around the registers if possible as
> + * it allocates the nodes.
> + */
> +void
> +ra_set_allocate_round_robin(struct ra_regs *regs)
> +{
> +   regs->round_robin = true;
> +}
> +
>  static void
>  ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
>  {
> @@ -436,16 +455,19 @@ GLboolean
>  ra_select(struct ra_graph *g)
>  {
>     int i;
> +   int start_search_reg = 0;
>
>     while (g->stack_count != 0) {
> -      unsigned int r;
> +      unsigned int ri;
> +      unsigned int r = -1;
>        int n = g->stack[g->stack_count - 1];
>        struct ra_class *c = g->regs->classes[g->nodes[n].class];
>
>        /* Find the lowest-numbered reg which is not used by a member
>         * of the graph adjacent to us.
>         */
> -      for (r = 0; r < g->regs->count; r++) {
> +      for (ri = 0; ri < g->regs->count; ri++) {
> +         r = (start_search_reg + ri) % g->regs->count;
>           if (!c->regs[r])
>              continue;
>
> @@ -461,12 +483,15 @@ ra_select(struct ra_graph *g)
>           if (i == g->nodes[n].adjacency_count)
>              break;
>        }
> -      if (r == g->regs->count)
> +      if (ri == g->regs->count)
>           return GL_FALSE;
>
>        g->nodes[n].reg = r;
>        g->nodes[n].in_stack = GL_FALSE;
>        g->stack_count--;
> +
> +      if (g->regs->round_robin)
> +         start_search_reg = r;
>     }
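The interesting bit is the modular scan in ra_select(). As a toy, standalone
illustration (not Mesa code; the one-neighbor interference model below is made
up purely to show the search order), compare the old lowest-register-first
search with the rotating one:

#include <stdio.h>

#define NUM_REGS  8
#define NUM_NODES 6

/* Toy interference: each node conflicts only with the node allocated
 * just before it, as if neighboring live ranges overlap. */
static int select_reg(unsigned start, int prev_reg)
{
   for (unsigned ri = 0; ri < NUM_REGS; ri++) {
      unsigned r = (start + ri) % NUM_REGS;
      if ((int)r != prev_reg)
         return r;              /* first register with no conflict */
   }
   return -1;                   /* allocation failed */
}

int main(void)
{
   int prev = -1;
   unsigned start = 0;

   /* Old behavior: every search starts at register 0 (dense packing). */
   printf("first-fit:   ");
   for (int n = 0; n < NUM_NODES; n++) {
      prev = select_reg(0, prev);
      printf("%d ", prev);
   }

   /* Patched behavior: resume the search at the last result. */
   prev = -1;
   printf("\nround-robin: ");
   for (int n = 0; n < NUM_NODES; n++) {
      prev = select_reg(start, prev);
      start = prev;
      printf("%d ", prev);
   }
   printf("\n");
   return 0;
}

This prints "0 1 0 1 0 1" for first-fit and "0 1 2 3 4 5" for round-robin:
the dense packing is exactly the false dependency chain the scheduler then
has to respect, while the rotating start spreads nodes across the file.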
With the s/= r/= r + 1/ change mentioned on IRC to make this work for 8-wide
too, it gets my

Reviewed-by: Matt Turner <[email protected]>
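For anyone wondering why the + 1 matters: with start_search_reg = r, a
following node that doesn't conflict with anything already colored begins its
search at r and immediately picks r again, so the rotation stalls. Starting
one past the last allocation keeps the walk moving. A throwaway loop (again
hypothetical, not Mesa code; it models only the search order, with no
conflicts at all) makes the difference visible:

#include <stdio.h>

#define NUM_REGS 4

int main(void)
{
   /* With no conflicts, the scan's first candidate always wins, so the
    * update rule alone decides where the next search begins. */
   unsigned start = 0;
   printf("start = r:     ");
   for (int n = 0; n < 4; n++) {
      unsigned r = start % NUM_REGS;   /* first free candidate */
      printf("%u ", r);
      start = r;                       /* stuck: never advances */
   }

   start = 0;
   printf("\nstart = r + 1: ");
   for (int n = 0; n < 4; n++) {
      unsigned r = start % NUM_REGS;
      printf("%u ", r);
      start = r + 1;                   /* actually walks the file */
   }
   printf("\n");
   return 0;
}

The first loop prints "0 0 0 0", the second "0 1 2 3".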
