Re: [Mesa-dev] [PATCH] freedreno: a2xx: ir2 update
() On Tue, Jul 24, 2018 at 9:00 AM Jonathan Marek wrote: > > this patch brings a number of changes to ir2: > -ir2 now generates CF clauses as necessary during assembly. this simplifies > fd2_program/fd2_compiler and is necessary to implement optimization passes > -ir2 now has separate vector/scalar instructions. this will make it easier > to implementing scheduling of scalar+vector instructions together. dst_reg > is also now seperate from src registers instead of a single list > -ir2 now implements register allocation. this makes it possible to compile > shaders which have more than 64 TGSI registers > -ir2 now implements the following optimizations: removal of IN/OUT MOV > instructions generated by TGSI and removal of unused instructions when > some exports are disabled > -ir2 now allows full 8-bit index for constants > -ir2_alloc no longer allocates 4 times too many bytes > So, this might be easier to review if it was split up a bit better into multiple patches. That said, I think I'll merge it as is, mostly because you folks are the main ones using and working on a2xx currently, and it isn't something that would break a3xx+. However, a few recommendations for the future: 1) you probably want to start running piglit and/or deqp_gles2. (Piglit has better desktop gl coverage, deqp has better gles coverage. Not sure whether you care more about gl or gles.) Due to feature level of a2xx (and because, iirc, I didn't start running piglit much until a3xx), I guess there will be a lot of skips and fails, but main thing you want to watch for is tests that transition pass->fail.. piglit-summary.py can compare before/after piglit runs. (Not really sure how to do that best with deqp, but you can use piglit to run deqp tests.) 2) shader-db is good for measuring the effect of compiler changes across a bunch of shaders. Probably worth wiring up shaderdb traces for a2xx, see dump_shader_info() in ir3_shader.c for example. I suppose you could use the same format for the traces and re-use fd-report.py in the shader-db tree to parse before/after results. (That script could probably use some improvements, like splitting VS/FS results.. I guess I'll do that next time I work up the courage to hack on python.) 3) seems like eventually you'll want to stop re-inventing register_allocate.[ch].. perhaps it is overkill for a2xx, I guess there wasn't anything complicated like multiple register banks or conflicting register classes. So maybe this is fine for now. BR, -R > Signed-off-by: Jonathan Marek > --- > .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++--- > .../drivers/freedreno/a2xx/fd2_program.c | 75 +- > .../drivers/freedreno/a2xx/instr-a2xx.h | 28 +- > src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 734 +++--- > src/gallium/drivers/freedreno/a2xx/ir-a2xx.h | 113 +-- > 5 files changed, 615 insertions(+), 545 deletions(-) > > diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > index 3ad47f9850..12f9a1ce0a 100644 > --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c > @@ -93,9 +93,6 @@ struct fd2_compile_context { > unsigned position, psize; > > uint64_t need_sync; > - > - /* current exec CF instruction */ > - struct ir2_cf *cf; > }; > > static int > @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct > fd_program_stateobj *prog, > > ctx->prog = prog; > ctx->so = so; > - ctx->cf = NULL; > ctx->pred_depth = 0; > > ret = tgsi_parse_init(>parser, so->tokens); > @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx) > tgsi_parse_free(>parser); > } > > -static struct ir2_cf * > -next_exec_cf(struct fd2_compile_context *ctx) > -{ > - struct ir2_cf *cf = ctx->cf; > - if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs)) > - ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC); > - return cf; > -} > - > static void > compile_vtx_fetch(struct fd2_compile_context *ctx) > { > @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) > int i; > for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) { > struct ir2_instruction *instr = ir2_instr_create( > - next_exec_cf(ctx), IR2_FETCH); > + ctx->so->ir, IR2_FETCH); > instr->fetch.opc = VTX_FETCH; > > ctx->need_sync |= 1 << (i+1); > > - ir2_reg_create(instr, i+1, "xyzw", 0); > - ir2_reg_create(instr, 0, "x", 0); > + ir2_dst_create(instr, i+1, "xyzw", 0); > + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); > > if (i == 0) > instr->sync = true; > @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) >
[Mesa-dev] [PATCH] freedreno: a2xx: ir2 update
this patch brings a number of changes to ir2: -ir2 now generates CF clauses as necessary during assembly. this simplifies fd2_program/fd2_compiler and is necessary to implement optimization passes -ir2 now has separate vector/scalar instructions. this will make it easier to implementing scheduling of scalar+vector instructions together. dst_reg is also now seperate from src registers instead of a single list -ir2 now implements register allocation. this makes it possible to compile shaders which have more than 64 TGSI registers -ir2 now implements the following optimizations: removal of IN/OUT MOV instructions generated by TGSI and removal of unused instructions when some exports are disabled -ir2 now allows full 8-bit index for constants -ir2_alloc no longer allocates 4 times too many bytes Signed-off-by: Jonathan Marek --- .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++--- .../drivers/freedreno/a2xx/fd2_program.c | 75 +- .../drivers/freedreno/a2xx/instr-a2xx.h | 28 +- src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 734 +++--- src/gallium/drivers/freedreno/a2xx/ir-a2xx.h | 113 +-- 5 files changed, 615 insertions(+), 545 deletions(-) diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c index 3ad47f9850..12f9a1ce0a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c @@ -93,9 +93,6 @@ struct fd2_compile_context { unsigned position, psize; uint64_t need_sync; - - /* current exec CF instruction */ - struct ir2_cf *cf; }; static int @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog, ctx->prog = prog; ctx->so = so; - ctx->cf = NULL; ctx->pred_depth = 0; ret = tgsi_parse_init(>parser, so->tokens); @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx) tgsi_parse_free(>parser); } -static struct ir2_cf * -next_exec_cf(struct fd2_compile_context *ctx) -{ - struct ir2_cf *cf = ctx->cf; - if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs)) - ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC); - return cf; -} - static void compile_vtx_fetch(struct fd2_compile_context *ctx) { @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) int i; for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) { struct ir2_instruction *instr = ir2_instr_create( - next_exec_cf(ctx), IR2_FETCH); + ctx->so->ir, IR2_FETCH); instr->fetch.opc = VTX_FETCH; ctx->need_sync |= 1 << (i+1); - ir2_reg_create(instr, i+1, "xyzw", 0); - ir2_reg_create(instr, 0, "x", 0); + ir2_dst_create(instr, i+1, "xyzw", 0); + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT); if (i == 0) instr->sync = true; @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx) vfetch_instrs[i] = instr; } ctx->so->num_vfetch_instrs = i; - ctx->cf = NULL; } /* @@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx) return num; } -static struct ir2_register * +static struct ir2_dst_register * add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, const struct tgsi_dst_register *dst) { @@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_'; swiz[4] = '\0'; - return ir2_reg_create(alu, num, swiz, flags); + return ir2_dst_create(alu, num, swiz, flags); } -static struct ir2_register * +static struct ir2_src_register * add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, const struct tgsi_src_register *src) { @@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu, if (ctx->type == PIPE_SHADER_VERTEX) { num = src->Index + 1; } else { + flags |= IR2_REG_INPUT; num = export_linkage(ctx, ctx->input_export_idx[src->Index]); } @@ -415,7 +402,7 @@ static void add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu) { if (inst->Instruction.Saturate) { - alu->alu.vector_clamp = true; + alu->alu_vector.clamp = true; } } @@ -423,7 +410,7 @@ static void add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu) { if (inst->Instruction.Saturate) { - alu->alu.scalar_clamp = true; +