Re: [Mesa-dev] [PATCH] freedreno: a2xx: ir2 update

2018-08-10 Thread Rob Clark
()
On Tue, Jul 24, 2018 at 9:00 AM Jonathan Marek  wrote:
>
> this patch brings a number of changes to ir2:
> -ir2 now generates CF clauses as necessary during assembly. this simplifies
>  fd2_program/fd2_compiler and is necessary to implement optimization passes
> -ir2 now has separate vector/scalar instructions. this will make it easier
>  to implementing scheduling of scalar+vector instructions together. dst_reg
>  is also now seperate from src registers instead of a single list
> -ir2 now implements register allocation. this makes it possible to compile
>  shaders which have more than 64 TGSI registers
> -ir2 now implements the following optimizations: removal of IN/OUT MOV
>  instructions generated by TGSI and removal of unused instructions when
>  some exports are disabled
> -ir2 now allows full 8-bit index for constants
> -ir2_alloc no longer allocates 4 times too many bytes
>

So, this might be easier to review if it was split up a bit better
into multiple patches.

That said, I think I'll merge it as is, mostly because you folks are
the main ones using and working on a2xx currently, and it isn't
something that would break a3xx+.

However, a few recommendations for the future:

1) you probably want to start running piglit and/or deqp_gles2.
(Piglit has better desktop gl coverage, deqp has better gles coverage.
Not sure whether you care more about gl or gles.)  Due to feature
level of a2xx (and because, iirc, I didn't start running piglit much
until a3xx), I guess there will be a lot of skips and fails, but main
thing you want to watch for is tests that transition pass->fail..
piglit-summary.py can compare before/after piglit runs.  (Not really
sure how to do that best with deqp, but you can use piglit to run deqp
tests.)

2) shader-db is good for measuring the effect of compiler changes
across a bunch of shaders.  Probably worth wiring up shaderdb traces
for a2xx, see dump_shader_info() in ir3_shader.c for example.  I
suppose you could use the same format for the traces and re-use
fd-report.py in the shader-db tree to parse before/after results.
(That script could probably use some improvements, like splitting
VS/FS results.. I guess I'll do that next time I work up the courage
to hack on python.)

3) seems like eventually you'll want to stop re-inventing
register_allocate.[ch].. perhaps it is overkill for a2xx, I guess
there wasn't anything complicated like multiple register banks or
conflicting register classes.  So maybe this is fine for now.

BR,
-R


> Signed-off-by: Jonathan Marek 
> ---
>  .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++---
>  .../drivers/freedreno/a2xx/fd2_program.c  |  75 +-
>  .../drivers/freedreno/a2xx/instr-a2xx.h   |  28 +-
>  src/gallium/drivers/freedreno/a2xx/ir-a2xx.c  | 734 +++---
>  src/gallium/drivers/freedreno/a2xx/ir-a2xx.h  | 113 +--
>  5 files changed, 615 insertions(+), 545 deletions(-)
>
> diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c 
> b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> index 3ad47f9850..12f9a1ce0a 100644
> --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> @@ -93,9 +93,6 @@ struct fd2_compile_context {
> unsigned position, psize;
>
> uint64_t need_sync;
> -
> -   /* current exec CF instruction */
> -   struct ir2_cf *cf;
>  };
>
>  static int
> @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct 
> fd_program_stateobj *prog,
>
> ctx->prog = prog;
> ctx->so = so;
> -   ctx->cf = NULL;
> ctx->pred_depth = 0;
>
> ret = tgsi_parse_init(>parser, so->tokens);
> @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx)
> tgsi_parse_free(>parser);
>  }
>
> -static struct ir2_cf *
> -next_exec_cf(struct fd2_compile_context *ctx)
> -{
> -   struct ir2_cf *cf = ctx->cf;
> -   if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
> -   ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
> -   return cf;
> -}
> -
>  static void
>  compile_vtx_fetch(struct fd2_compile_context *ctx)
>  {
> @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
> int i;
> for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
> struct ir2_instruction *instr = ir2_instr_create(
> -   next_exec_cf(ctx), IR2_FETCH);
> +   ctx->so->ir, IR2_FETCH);
> instr->fetch.opc = VTX_FETCH;
>
> ctx->need_sync |= 1 << (i+1);
>
> -   ir2_reg_create(instr, i+1, "xyzw", 0);
> -   ir2_reg_create(instr, 0, "x", 0);
> +   ir2_dst_create(instr, i+1, "xyzw", 0);
> +   ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> if (i == 0)
> instr->sync = true;
> @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
> 

[Mesa-dev] [PATCH] freedreno: a2xx: ir2 update

2018-07-24 Thread Jonathan Marek
this patch brings a number of changes to ir2:
-ir2 now generates CF clauses as necessary during assembly. this simplifies
 fd2_program/fd2_compiler and is necessary to implement optimization passes
-ir2 now has separate vector/scalar instructions. this will make it easier
 to implementing scheduling of scalar+vector instructions together. dst_reg
 is also now seperate from src registers instead of a single list
-ir2 now implements register allocation. this makes it possible to compile
 shaders which have more than 64 TGSI registers
-ir2 now implements the following optimizations: removal of IN/OUT MOV
 instructions generated by TGSI and removal of unused instructions when
 some exports are disabled
-ir2 now allows full 8-bit index for constants
-ir2_alloc no longer allocates 4 times too many bytes

Signed-off-by: Jonathan Marek 
---
 .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++---
 .../drivers/freedreno/a2xx/fd2_program.c  |  75 +-
 .../drivers/freedreno/a2xx/instr-a2xx.h   |  28 +-
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.c  | 734 +++---
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.h  | 113 +--
 5 files changed, 615 insertions(+), 545 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c 
b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
index 3ad47f9850..12f9a1ce0a 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -93,9 +93,6 @@ struct fd2_compile_context {
unsigned position, psize;
 
uint64_t need_sync;
-
-   /* current exec CF instruction */
-   struct ir2_cf *cf;
 };
 
 static int
@@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct 
fd_program_stateobj *prog,
 
ctx->prog = prog;
ctx->so = so;
-   ctx->cf = NULL;
ctx->pred_depth = 0;
 
ret = tgsi_parse_init(>parser, so->tokens);
@@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx)
tgsi_parse_free(>parser);
 }
 
-static struct ir2_cf *
-next_exec_cf(struct fd2_compile_context *ctx)
-{
-   struct ir2_cf *cf = ctx->cf;
-   if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
-   ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
-   return cf;
-}
-
 static void
 compile_vtx_fetch(struct fd2_compile_context *ctx)
 {
@@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
int i;
for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
struct ir2_instruction *instr = ir2_instr_create(
-   next_exec_cf(ctx), IR2_FETCH);
+   ctx->so->ir, IR2_FETCH);
instr->fetch.opc = VTX_FETCH;
 
ctx->need_sync |= 1 << (i+1);
 
-   ir2_reg_create(instr, i+1, "xyzw", 0);
-   ir2_reg_create(instr, 0, "x", 0);
+   ir2_dst_create(instr, i+1, "xyzw", 0);
+   ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
 
if (i == 0)
instr->sync = true;
@@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
vfetch_instrs[i] = instr;
}
ctx->so->num_vfetch_instrs = i;
-   ctx->cf = NULL;
 }
 
 /*
@@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx)
return num;
 }
 
-static struct ir2_register *
+static struct ir2_dst_register *
 add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
const struct tgsi_dst_register *dst)
 {
@@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct 
ir2_instruction *alu,
swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
swiz[4] = '\0';
 
-   return ir2_reg_create(alu, num, swiz, flags);
+   return ir2_dst_create(alu, num, swiz, flags);
 }
 
-static struct ir2_register *
+static struct ir2_src_register *
 add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
const struct tgsi_src_register *src)
 {
@@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct 
ir2_instruction *alu,
if (ctx->type == PIPE_SHADER_VERTEX) {
num = src->Index + 1;
} else {
+   flags |= IR2_REG_INPUT;
num = export_linkage(ctx,
ctx->input_export_idx[src->Index]);
}
@@ -415,7 +402,7 @@ static void
 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction 
*alu)
 {
if (inst->Instruction.Saturate) {
-   alu->alu.vector_clamp = true;
+   alu->alu_vector.clamp = true;
}
 }
 
@@ -423,7 +410,7 @@ static void
 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction 
*alu)
 {
if (inst->Instruction.Saturate) {
-   alu->alu.scalar_clamp = true;
+