[Mesa-dev] [PATCH 8/9] freedreno: implement a20x hw binning

Jonathan Marek Tue, 13 Nov 2018 12:08:38 -0800

Not in this patch: emitting the hw binning variant and filling the
"draw_patches". That is part of the ir2 patch.


Signed-off-by: Jonathan Marek <jonat...@marek.ca>
---
 src/gallium/drivers/freedreno/a2xx/fd2_draw.c |  47 +++++--
 src/gallium/drivers/freedreno/a2xx/fd2_emit.c |   8 +-
 src/gallium/drivers/freedreno/a2xx/fd2_emit.h |   3 +-
 src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 118 ++++++++++++++++++
 .../drivers/freedreno/freedreno_gmem.c        |  29 +++--
 .../drivers/freedreno/freedreno_gmem.h        |   1 +
 6 files changed, 187 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c 
b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index 49df1daa59..46c76df807 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -75,11 +75,13 @@ emit_vertexbufs(struct fd_context *ctx)
        // CONST(20,0) (or CONST(26,0) in soliv_vp)
 
        fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements);
+       fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, 
vtx->num_elements);
 }
 
 static void
 draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info,
-                  struct fd_ringbuffer *ring, unsigned index_offset)
+                  struct fd_ringbuffer *ring, unsigned index_offset,
+                  bool binning)
 {
        enum pc_di_vis_cull_mode vismode;
 
@@ -87,9 +89,15 @@ draw_impl(struct fd_context *ctx, const struct 
pipe_draw_info *info,
        OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET));
        OUT_RING(ring, info->index_size ? 0 : info->start);
 
-       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-       OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
-       OUT_RING(ring, is_a20x(ctx->screen) ? 0x00000002 : 0x0000003b);
+       /* in the binning batch, this value is set once in fd2_emit_tile_init */
+       if (!binning) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+               /* XXX do this for every REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL 
write ?
+                * if set to 0x3b on a20x, clipping is broken
+                */
+               OUT_RING(ring, is_a20x(ctx->screen) ? 0x00000002 : 0x0000003b);
+       }
 
        OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
        OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
@@ -125,8 +133,26 @@ draw_impl(struct fd_context *ctx, const struct 
pipe_draw_info *info,
                OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
        }
 
+       /* binning shader will take offset from C64 */
+       if (binning && is_a20x(ctx->screen)) {
+               OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+               OUT_RING(ring, 0x00000180);
+               OUT_RING(ring, fui(ctx->batch->num_vertices));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(0.0f));
+               OUT_RING(ring, fui(0.0f));
+       }
+
+       vismode = binning ? IGNORE_VISIBILITY : USE_VISIBILITY;
+       /* a22x hw binning not implemented */
+       if (binning || !is_a20x(ctx->screen) || (fd_mesa_debug & FD_DBG_NOBIN))
+               vismode = IGNORE_VISIBILITY;
+
+       if (info->mode == PIPE_PRIM_POINTS)
+               vismode = IGNORE_VISIBILITY;
+
        fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode],
-                                IGNORE_VISIBILITY, info, index_offset);
+                                vismode, info, index_offset);
 
        if (is_a20x(ctx->screen)) {
                /* not sure why this is required, but it fixes some hangs */
@@ -152,6 +178,7 @@ fd2_draw_vbo(struct fd_context *ctx, const struct 
pipe_draw_info *pinfo,
                emit_vertexbufs(ctx);
 
        fd2_emit_state(ctx, ctx->batch->draw, ctx->dirty);
+       fd2_emit_state(ctx, ctx->batch->binning, ctx->dirty);
 
        /* a20x can draw only 65535 vertices at once
         * however, using a limit of 32k fixes an unexplained hang
@@ -171,17 +198,23 @@ fd2_draw_vbo(struct fd_context *ctx, const struct 
pipe_draw_info *pinfo,
                struct pipe_draw_info info = *pinfo;
                unsigned count = info.count;
                unsigned step = step_tbl[info.mode];
+               unsigned num_vertices = ctx->batch->num_vertices;
 
                if (!step)
                        return false;
 
                for (; count + step > 32766; count -= step) {
                        info.count = MIN2(count, 32766);
-                       draw_impl(ctx, &info, ctx->batch->draw, index_offset);
+                       draw_impl(ctx, &info, ctx->batch->draw, index_offset, 
false);
+                       draw_impl(ctx, &info, ctx->batch->binning, 
index_offset, true);
                        info.start += step;
+                       ctx->batch->num_vertices += step;
                }
+               /* changing this value is a hack, restore it */
+               ctx->batch->num_vertices = num_vertices;
        } else {
-               draw_impl(ctx, pinfo, ctx->batch->draw, index_offset);
+               draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false);
+               draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true);
        }
 
        fd_context_all_clean(ctx);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c 
b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index 60bc9fad4c..4e93cb39b0 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -186,11 +186,12 @@ fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t 
val,
 }
 
 void
-fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty)
+fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               const enum fd_dirty_3d_state dirty)
 {
        struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend);
        struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa);
-       struct fd_ringbuffer *ring = ctx->batch->draw;
+       bool is_binning = (ring == ctx->batch->binning);
 
        /* NOTE: we probably want to eventually refactor this so each state
         * object handles emitting it's own state..  although the mapping of
@@ -228,7 +229,8 @@ fd2_emit_state(struct fd_context *ctx, const enum 
fd_dirty_3d_state dirty)
                OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL));
                OUT_RING(ring, rasterizer->pa_cl_clip_cntl);
                OUT_RING(ring, rasterizer->pa_su_sc_mode_cntl |
-                               
A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE);
+                               
A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE |
+                               COND(is_binning, 
A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE));
 
                OUT_PKT3(ring, CP_SET_CONSTANT, 5);
                OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POINT_SIZE));
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h 
b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
index 5e4bddd1fa..38660e733e 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h
@@ -40,7 +40,8 @@ struct fd2_vertex_buf {
 
 void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val,
                struct fd2_vertex_buf *vbufs, uint32_t n);
-void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty);
+void fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
+               const enum fd_dirty_3d_state dirty);
 void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring);
 
 void fd2_emit_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c 
b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
index e98ae7334a..b6b7050026 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -375,6 +375,107 @@ fd2_emit_tile_init(struct fd_batch *batch)
        if (pfb->zsbuf)
                reg |= 
A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format));
        OUT_RING(ring, reg);                         /* RB_DEPTH_INFO */
+
+       /* set to zero, for some reason hardware doesn't like certain values */
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN));
+       OUT_RING(ring, 0);
+
+       OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+       OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX));
+       OUT_RING(ring, 0);
+
+       if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN) &&
+               gmem->num_vsc_pipes) {
+               /* patch out unneeded memory exports by changing EXEC CF to 
EXEC_END
+                *
+                * in the shader compiler, we guarantee that the shader ends 
with
+                * a specific pattern of ALLOC/EXEC CF pairs for the hw binning 
exports
+                *
+                * since the patches point only to dwords and CFs are 1.5 dwords,
+                * the patch is aligned and might point to an ALLOC CF
+                */
+               for (int i = 0; i < 
fd_patch_num_elements(&batch->draw_patches); i++) {
+                       struct fd_cs_patch *patch =
+                               fd_patch_element(&batch->draw_patches, i);
+                       *patch->cs = patch->val;
+
+                       instr_cf_t *cf = (instr_cf_t*) patch->cs;
+                       if (cf->opc == ALLOC)
+                               cf++;
+                       assert(cf->opc == EXEC);
+                       assert(cf[ctx->screen->num_vsc_pipes*2-2].opc == 
EXEC_END);
+                       cf[2*(gmem->num_vsc_pipes-1)].opc = EXEC_END;
+               }
+
+               /* initialize shader constants for the binning memexport */
+               OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4);
+               OUT_RING(ring, 0x0000000C);
+
+               for (int i = 0; i < gmem->num_vsc_pipes; i++) {
+                       struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+
+                       /* XXX we know how large this needs to be..
+                        * should do some sort of realloc
+                        * it should be ctx->batch->num_vertices bytes large
+                        * with this size it will break with more than 256k 
vertices..
+                        */
+                       if (!pipe->bo) {
+                               pipe->bo = fd_bo_new(ctx->dev, 0x40000,
+                                       DRM_FREEDRENO_GEM_TYPE_KMEM);
+                       }
+
+                       /* memory export address (export32):
+                        * .x: (base_address >> 2) | 0x40000000 (?)
+                        * .y: index (float) - set by shader
+                        * .z: 0x4B00D000 (?)
+                        * .w: 0x4B000000 (?) | max_index (?)
+                       */
+                       OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2);
+                       OUT_RING(ring, 0x00000000);
+                       OUT_RING(ring, 0x4B00D000);
+                       OUT_RING(ring, 0x4B000000 | 0x40000);
+               }
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8);
+               OUT_RING(ring, 0x0000018C);
+
+               for (int i = 0; i < gmem->num_vsc_pipes; i++) {
+                       struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
+                       float off_x, off_y, mul_x, mul_y;
+
+                       /* const to transform from [-1,1] to bin coordinates for this pipe
+                        * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, 
etc
+                        * 8 possible values on x/y axis,
+                        * to clip at binning stage: only use center 6x6
+                        * TODO: set the z parameters too so that hw binning
+                        * can clip primitives in Z too
+                        */
+
+                       mul_x = 1.0f / (float) (gmem->bin_w * 8);
+                       mul_y = 1.0f / (float) (gmem->bin_h * 8);
+                       off_x = -pipe->x * (1.0/8.0f) + 0.125f;
+                       off_y = -pipe->y * (1.0/8.0f) + 0.125f;
+
+                       OUT_RING(ring, fui(off_x * (256.0f/255.0f)));
+                       OUT_RING(ring, fui(off_y * (256.0f/255.0f)));
+                       OUT_RING(ring, 0x3f000000);
+                       OUT_RING(ring, fui(0.0f));
+
+                       OUT_RING(ring, fui(mul_x * (256.0f/255.0f)));
+                       OUT_RING(ring, fui(mul_y * (256.0f/255.0f)));
+                       OUT_RING(ring, fui(0.0f));
+                       OUT_RING(ring, fui(0.0f));
+               }
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
+               OUT_RING(ring, 0);
+
+               ctx->emit_ib(ring, batch->binning);
+       }
+
+       util_dynarray_resize(&batch->draw_patches, 0);
 }
 
 /* before mem2gmem */
@@ -403,6 +504,7 @@ fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile 
*tile)
 static void
 fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
 {
+       struct fd_context *ctx = batch->ctx;
        struct fd_ringbuffer *ring = batch->gmem;
        struct pipe_framebuffer_state *pfb = &batch->framebuffer;
        enum pipe_format format = pipe_surface_format(pfb->cbufs[0]);
@@ -419,6 +521,22 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct 
fd_tile *tile)
        OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET));
        OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) |
                        A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff));
+
+       if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) {
+               struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN));
+               OUT_RING(ring, tile->n);
+
+               OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+               OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX));
+               OUT_RING(ring, tile->n);
+
+               /* TODO only emit this when tile->p changes */
+               OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1);
+               OUT_RELOC(ring, pipe->bo, 0, 0, 0);
+       }
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c 
b/src/gallium/drivers/freedreno/freedreno_gmem.c
index bb15f0a3e1..d2483de1b0 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -214,12 +214,21 @@ calculate_tiles(struct fd_batch *batch)
 
 #define div_round_up(v, a)  (((v) + (a) - 1) / (a))
        /* figure out number of tiles per pipe: */
-       tpp_x = tpp_y = 1;
-       while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes)
-               tpp_y += 2;
-       while ((div_round_up(nbins_y, tpp_y) *
-                       div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes)
-               tpp_x += 1;
+       if (is_a20x(ctx->screen)) {
+               /* for a20x we want to minimize the number of "pipes"
+                * binning data has 3 bits for x/y (8x8) but the edges are used 
to
+                * cull off-screen vertices with hw binning, so we have 6x6 
pipes
+                */
+               tpp_x = 6;
+               tpp_y = 6;
+       } else {
+               tpp_x = tpp_y = 1;
+               while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes)
+                       tpp_y += 2;
+               while ((div_round_up(nbins_y, tpp_y) *
+                               div_round_up(nbins_x, tpp_x)) > 
screen->num_vsc_pipes)
+                       tpp_x += 1;
+       }
 
        gmem->maxpw = tpp_x;
        gmem->maxph = tpp_y;
@@ -246,6 +255,9 @@ calculate_tiles(struct fd_batch *batch)
                xoff += tpp_x;
        }
 
+       /* number of pipes to use for a20x */
+       gmem->num_vsc_pipes = MAX2(1, i);
+
        for (; i < npipes; i++) {
                struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i];
                pipe->x = pipe->y = pipe->w = pipe->h = 0;
@@ -280,11 +292,12 @@ calculate_tiles(struct fd_batch *batch)
 
                        /* pipe number: */
                        p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / 
tpp_x);
+                       assert(p < gmem->num_vsc_pipes);
 
                        /* clip bin width: */
                        bw = MIN2(bin_w, minx + width - xoff);
-
-                       tile->n = tile_n[p]++;
+                       tile->n = !is_a20x(ctx->screen) ? tile_n[p]++ :
+                               ((i % tpp_y + 1) << 3 | (j % tpp_x + 1));
                        tile->p = p;
                        tile->bin_w = bw;
                        tile->bin_h = bh;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h 
b/src/gallium/drivers/freedreno/freedreno_gmem.h
index b953999ff9..70641d62f3 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -57,6 +57,7 @@ struct fd_gmem_stateobj {
        uint16_t minx, miny;
        uint16_t width, height;
        uint16_t maxpw, maxph;   /* maximum pipe width/height */
+       uint8_t num_vsc_pipes;   /* number of pipes for a20x */
 };
 
 struct fd_batch;
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 8/9] freedreno: implement a20x hw binning

Reply via email to