Not included in this patch: emitting the hardware binning shader variant and filling in the "draw_patches" list. Both are part of the ir2 patch.
Signed-off-by: Jonathan Marek <jonat...@marek.ca> --- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 47 +++++-- src/gallium/drivers/freedreno/a2xx/fd2_emit.c | 8 +- src/gallium/drivers/freedreno/a2xx/fd2_emit.h | 3 +- src/gallium/drivers/freedreno/a2xx/fd2_gmem.c | 118 ++++++++++++++++++ .../drivers/freedreno/freedreno_gmem.c | 29 +++-- .../drivers/freedreno/freedreno_gmem.h | 1 + 6 files changed, 187 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 49df1daa59..46c76df807 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -75,11 +75,13 @@ emit_vertexbufs(struct fd_context *ctx) // CONST(20,0) (or CONST(26,0) in soliv_vp) fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); + fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements); } static void draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, - struct fd_ringbuffer *ring, unsigned index_offset) + struct fd_ringbuffer *ring, unsigned index_offset, + bool binning) { enum pc_di_vis_cull_mode vismode; @@ -87,9 +89,15 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); OUT_RING(ring, info->index_size ? 0 : info->start); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, is_a20x(ctx->screen) ? 0x00000002 : 0x0000003b); + /* in the binning batch, this value is set once in fd2_emit_tile_init */ + if (!binning) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + /* XXX do this for every REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL write ? + * if set to 0x3b on a20x, clipping is broken + */ + OUT_RING(ring, is_a20x(ctx->screen) ? 
0x00000002 : 0x0000003b); + } OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); @@ -125,8 +133,26 @@ draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, OUT_RING(ring, info->min_index); /* VGT_MIN_VTX_INDX */ } + /* binning shader will take offset from C64 */ + if (binning && is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000180); + OUT_RING(ring, fui(ctx->batch->num_vertices)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + vismode = binning ? IGNORE_VISIBILITY : USE_VISIBILITY; + /* a22x hw binning not implemented */ + if (binning || !is_a20x(ctx->screen) || (fd_mesa_debug & FD_DBG_NOBIN)) + vismode = IGNORE_VISIBILITY; + + if (info->mode == PIPE_PRIM_POINTS) + vismode = IGNORE_VISIBILITY; + fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], - IGNORE_VISIBILITY, info, index_offset); + vismode, info, index_offset); if (is_a20x(ctx->screen)) { /* not sure why this is required, but it fixes some hangs */ @@ -152,6 +178,7 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, emit_vertexbufs(ctx); fd2_emit_state(ctx, ctx->batch->draw, ctx->dirty); + fd2_emit_state(ctx, ctx->batch->binning, ctx->dirty); /* a20x can draw only 65535 vertices at once * however, using a limit of 32k fixes an unexplained hang @@ -171,17 +198,23 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, struct pipe_draw_info info = *pinfo; unsigned count = info.count; unsigned step = step_tbl[info.mode]; + unsigned num_vertices = ctx->batch->num_vertices; if (!step) return false; for (; count + step > 32766; count -= step) { info.count = MIN2(count, 32766); - draw_impl(ctx, &info, ctx->batch->draw, index_offset); + draw_impl(ctx, &info, ctx->batch->draw, index_offset, false); + draw_impl(ctx, &info, ctx->batch->binning, index_offset, true); info.start += step; + ctx->batch->num_vertices += step; } + /* 
changing this value is a hack, restore it */ + ctx->batch->num_vertices = num_vertices; } else { - draw_impl(ctx, pinfo, ctx->batch->draw, index_offset); + draw_impl(ctx, pinfo, ctx->batch->draw, index_offset, false); + draw_impl(ctx, pinfo, ctx->batch->binning, index_offset, true); } fd_context_all_clean(ctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index 60bc9fad4c..4e93cb39b0 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -186,11 +186,12 @@ fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, } void -fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) +fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + const enum fd_dirty_3d_state dirty) { struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa); - struct fd_ringbuffer *ring = ctx->batch->draw; + bool is_binning = (ring == ctx->batch->binning); /* NOTE: we probably want to eventually refactor this so each state * object handles emitting it's own state.. 
although the mapping of @@ -228,7 +229,8 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); OUT_RING(ring, rasterizer->pa_cl_clip_cntl); OUT_RING(ring, rasterizer->pa_su_sc_mode_cntl | - A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE); + A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE | + COND(is_binning, A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE)); OUT_PKT3(ring, CP_SET_CONSTANT, 5); OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POINT_SIZE)); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index 5e4bddd1fa..38660e733e 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -40,7 +40,8 @@ struct fd2_vertex_buf { void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, struct fd2_vertex_buf *vbufs, uint32_t n); -void fd2_emit_state(struct fd_context *ctx, enum fd_dirty_3d_state dirty); +void fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, + const enum fd_dirty_3d_state dirty); void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring); void fd2_emit_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index e98ae7334a..b6b7050026 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -375,6 +375,107 @@ fd2_emit_tile_init(struct fd_batch *batch) if (pfb->zsbuf) reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); OUT_RING(ring, reg); /* RB_DEPTH_INFO */ + + /* set to zero, for some reason hardware doesn't like certain values */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, 0); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, 0); + + if 
(is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN) && + gmem->num_vsc_pipes) { + /* patch out unneeded memory exports by changing EXEC CF to EXEC_END + * + * in the shader compiler, we guarantee that the shader ends with + * a specific pattern of ALLOC/EXEC CF pairs for the hw binning exports + * + * since the patches point only to dwords and CFs are 1.5 dwords, + * the patch is aligned and might point to an ALLOC CF + */ + for (int i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = + fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val; + + instr_cf_t *cf = (instr_cf_t*) patch->cs; + if (cf->opc == ALLOC) + cf++; + assert(cf->opc == EXEC); + assert(cf[ctx->screen->num_vsc_pipes*2-2].opc == EXEC_END); + cf[2*(gmem->num_vsc_pipes-1)].opc = EXEC_END; + } + + /* initialize shader constants for the binning memexport */ + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4); + OUT_RING(ring, 0x0000000C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + + /* XXX we know how large this needs to be.. + * should do some sort of realloc + * it should be ctx->batch->num_vertices bytes large + * with this size it will break with more than 256k vertices.. + */ + if (!pipe->bo) { + pipe->bo = fd_bo_new(ctx->dev, 0x40000, + DRM_FREEDRENO_GEM_TYPE_KMEM); + } + + /* memory export address (export32): + * .x: (base_address >> 2) | 0x40000000 (?) + * .y: index (float) - set by shader + * .z: 0x4B00D000 (?) + * .w: 0x4B000000 (?) | max_index (?) 
+ */ + OUT_RELOCW(ring, pipe->bo, 0, 0x40000000, -2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x4B00D000); + OUT_RING(ring, 0x4B000000 | 0x40000); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8); + OUT_RING(ring, 0x0000018C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; + float off_x, off_y, mul_x, mul_y; + + /* const to transform from [-1,1] to bin coordinates for this pipe + * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc + * 8 possible values on x/y axis, + * to clip at binning stage: only use center 6x6 + * TODO: set the z parameters too so that hw binning + * can clip primitives in Z too + */ + + mul_x = 1.0f / (float) (gmem->bin_w * 8); + mul_y = 1.0f / (float) (gmem->bin_h * 8); + off_x = -pipe->x * (1.0/8.0f) + 0.125f; + off_y = -pipe->y * (1.0/8.0f) + 0.125f; + + OUT_RING(ring, fui(off_x * (256.0f/255.0f))); + OUT_RING(ring, fui(off_y * (256.0f/255.0f))); + OUT_RING(ring, 0x3f000000); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(mul_x * (256.0f/255.0f))); + OUT_RING(ring, fui(mul_y * (256.0f/255.0f))); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0); + + ctx->emit_ib(ring, batch->binning); + } + + util_dynarray_resize(&batch->draw_patches, 0); } /* before mem2gmem */ @@ -403,6 +504,7 @@ fd2_emit_tile_prep(struct fd_batch *batch, struct fd_tile *tile) static void fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) { + struct fd_context *ctx = batch->ctx; struct fd_ringbuffer *ring = batch->gmem; struct pipe_framebuffer_state *pfb = &batch->framebuffer; enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); @@ -419,6 +521,22 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile) OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); OUT_RING(ring, 
A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); + + if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) { + struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p]; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, tile->n); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, tile->n); + + /* TODO only emit this when tile->p changes */ + OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); + OUT_RELOC(ring, pipe->bo, 0, 0, 0); + } } void diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index bb15f0a3e1..d2483de1b0 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -214,12 +214,21 @@ calculate_tiles(struct fd_batch *batch) #define div_round_up(v, a) (((v) + (a) - 1) / (a)) /* figure out number of tiles per pipe: */ - tpp_x = tpp_y = 1; - while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes) - tpp_y += 2; - while ((div_round_up(nbins_y, tpp_y) * - div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes) - tpp_x += 1; + if (is_a20x(ctx->screen)) { + /* for a20x we want to minimize the number of "pipes" + * binning data has 3 bits for x/y (8x8) but the edges are used to + * cull off-screen vertices with hw binning, so we have 6x6 pipes + */ + tpp_x = 6; + tpp_y = 6; + } else { + tpp_x = tpp_y = 1; + while (div_round_up(nbins_y, tpp_y) > screen->num_vsc_pipes) + tpp_y += 2; + while ((div_round_up(nbins_y, tpp_y) * + div_round_up(nbins_x, tpp_x)) > screen->num_vsc_pipes) + tpp_x += 1; + } gmem->maxpw = tpp_x; gmem->maxph = tpp_y; @@ -246,6 +255,9 @@ calculate_tiles(struct fd_batch *batch) xoff += tpp_x; } + /* number of pipes to use for a20x */ + gmem->num_vsc_pipes = MAX2(1, i); + for (; i < npipes; i++) { struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[i]; pipe->x = pipe->y = pipe->w = 
pipe->h = 0; @@ -280,11 +292,12 @@ calculate_tiles(struct fd_batch *batch) /* pipe number: */ p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x); + assert(p < gmem->num_vsc_pipes); /* clip bin width: */ bw = MIN2(bin_w, minx + width - xoff); - - tile->n = tile_n[p]++; + tile->n = !is_a20x(ctx->screen) ? tile_n[p]++ : + ((i % tpp_y + 1) << 3 | (j % tpp_x + 1)); tile->p = p; tile->bin_w = bw; tile->bin_h = bh; diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h index b953999ff9..70641d62f3 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.h +++ b/src/gallium/drivers/freedreno/freedreno_gmem.h @@ -57,6 +57,7 @@ struct fd_gmem_stateobj { uint16_t minx, miny; uint16_t width, height; uint16_t maxpw, maxph; /* maximum pipe width/height */ + uint8_t num_vsc_pipes; /* number of pipes for a20x */ }; struct fd_batch; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev