Am 01.02.2016 um 02:00 schrieb srol...@vmware.com: > From: Roland Scheidegger <srol...@vmware.com> > > If the tri is fully inside a scissor edge (or rather, we just use the > bounding box of the tri for the comparison), then we can drop these > additional scissor "planes" early. We do not even need to allocate > space for them in the tri. > The math actually appears to be slightly iffy due to bounding boxes > being rounded, but it doesn't matter in the end. > Those scissor rects are costly - the 4 planes from the scissor are > already more expensive to calculate than the 3 planes from the tri itself, > and it also prevents us from using the specialized raster code for small > tris. > This helps openarena performance by about 8% or so. Of course, it helps > there that while openarena often enables scissoring (and even moves the > scissor rect around) I have not seen a single tri actually hit the > scissor rect, ever. > > v2: drop individual scissor edges, and do it earlier, not even allocating > space for them. > --- > src/gallium/drivers/llvmpipe/lp_rast_tri.c | 16 +++++ > src/gallium/drivers/llvmpipe/lp_setup_line.c | 81 ++++++++++++++--------- > src/gallium/drivers/llvmpipe/lp_setup_tri.c | 98 > +++++++++++++++++----------- > 3 files changed, 126 insertions(+), 69 deletions(-) > > diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c > b/src/gallium/drivers/llvmpipe/lp_rast_tri.c > index f4a2f02..bf27900 100644 > --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c > +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c > @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, > */ > dcdx = _mm_sub_epi32(zero, dcdx); > > +#if 0 > + { > + __m128i lbits, fixup, xy, dcdxdy, dcdx16; > + lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1)); > + fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128()); > + c = _mm_srai_epi32(c, 8); > + c = _mm_add_epi32(c, fixup); > + xy = _mm_set1_epi32(x | y << 16); > + dcdx = _mm_srai_epi32(dcdx, 8); > + dcdy = _mm_srai_epi32(dcdy, 8); > + dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx); > + dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16)); > + c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy)); > + } > +#else > c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); > c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); > > /* Adjust so we can just check the sign bit (< 0 comparison), instead of > having to do a less efficient <= 0 comparison */ > c = _mm_sub_epi32(c, _mm_set1_epi32(1)); > +#endif Oops disregard this hunk. That's part of some experiment (use the same down-shifting logic for calculations used in the generic rasterization functions, plus some different fixup calc (with logic ops rather than the subs used elsewhere), and just because I can, replace the emulated 2 32bit muls with one single 16bit madd, all in all that's quite a bit cheaper... could do that for the 32_3_16 func as well - works well, maybe I'll look at that another time again...)
Roland > > dcdx2 = _mm_add_epi32(dcdx, dcdx); > dcdx3 = _mm_add_epi32(dcdx2, dcdx); > diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c > b/src/gallium/drivers/llvmpipe/lp_setup_line.c > index f425825..3ec9ac4 100644 > --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c > +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c > @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup, > layer = MIN2(layer, scene->fb_max_layer); > } > > - if (setup->scissor_test) { > - nr_planes = 8; > - } > - else { > - nr_planes = 4; > - } > - > dx = v1[0][0] - v2[0][0]; > dy = v1[0][1] - v2[0][1]; > area = (dx * dx + dy * dy); > @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup, > bbox.x0 = MAX2(bbox.x0, 0); > bbox.y0 = MAX2(bbox.y0, 0); > > + nr_planes = 4; > + /* > + * Determine how many scissor planes we need, that is drop scissor > + * edges if the bounding box of the tri is fully inside that edge. > + */ > + if (setup->scissor_test) { > + /* why not just use draw_regions */ > + struct u_rect *scissor = &setup->scissors[viewport_index]; > + if (bbox.x0 < scissor->x0) nr_planes++; > + if (bbox.x1 > scissor->x1) nr_planes++; > + if (bbox.y0 < scissor->y0) nr_planes++; > + if (bbox.y1 > scissor->y1) nr_planes++; > + } > + > line = lp_setup_alloc_triangle(scene, > key->num_inputs, > nr_planes, > @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup, > * Note that otherwise, the scissor planes only vary in 'C' value, > * and even then only on state-changes. Could alternatively store > * these planes elsewhere. > + * (Or only store the c value together with a bit indicating which > + * scissor edge this is, so rasterization would treat them differently > + * (easier to evaluate) to ordinary planes.) > */ > - if (nr_planes == 8) { > - const struct u_rect *scissor = > - &setup->scissors[viewport_index]; > - > - plane[4].dcdx = -1 << 8; > - plane[4].dcdy = 0; > - plane[4].c = (1-scissor->x0) << 8; > - plane[4].eo = 1 << 8; > - > - plane[5].dcdx = 1 << 8; > - plane[5].dcdy = 0; > - plane[5].c = (scissor->x1+1) << 8; > - plane[5].eo = 0; > - > - plane[6].dcdx = 0; > - plane[6].dcdy = 1 << 8; > - plane[6].c = (1-scissor->y0) << 8; > - plane[6].eo = 1 << 8; > - > - plane[7].dcdx = 0; > - plane[7].dcdy = -1 << 8; > - plane[7].c = (scissor->y1+1) << 8; > - plane[7].eo = 0; > + if (nr_planes > 4) { > + /* why not just use draw_regions */ > + struct u_rect *scissor = &setup->scissors[viewport_index]; > + unsigned scis_index = 4; > + > + if (bbox.x0 < scissor->x0) { > + plane[scis_index].dcdx = -1 << 8; > + plane[scis_index].dcdy = 0; > + plane[scis_index].c = (1-scissor->x0) << 8; > + plane[scis_index].eo = 1 << 8; > + scis_index++; > + } > + if (bbox.x1 > scissor->x1) { > + plane[scis_index].dcdx = 1 << 8; > + plane[scis_index].dcdy = 0; > + plane[scis_index].c = (scissor->x1+1) << 8; > + plane[scis_index].eo = 0 << 8; > + scis_index++; > + } > + if (bbox.y0 < scissor->y0) { > + plane[scis_index].dcdx = 0; > + plane[scis_index].dcdy = 1 << 8; > + plane[scis_index].c = (1-scissor->y0) << 8; > + plane[scis_index].eo = 1 << 8; > + scis_index++; > + } > + if (bbox.y1 > scissor->y1) { > + plane[scis_index].dcdx = 0; > + plane[scis_index].dcdy = -1 << 8; > + plane[scis_index].c = (scissor->y1+1) << 8; > + plane[scis_index].eo = 0; > + scis_index++; > + } > + assert(scis_index == nr_planes); > } > > return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, > viewport_index); > diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c > b/src/gallium/drivers/llvmpipe/lp_setup_tri.c > index 1e3a750..4e9acce 100644 > --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c > +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c > @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup, > layer = MIN2(layer, scene->fb_max_layer); > } > > - if (setup->scissor_test) { > - nr_planes = 7; > - } > - else { > - nr_planes = 3; > - } > - > /* Bounding rectangle (in pixels) */ > { > /* Yes this is necessary to accurately calculate bounding boxes > @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup, > bbox.x0 = MAX2(bbox.x0, 0); > bbox.y0 = MAX2(bbox.y0, 0); > > + nr_planes = 3; > + /* > + * Determine how many scissor planes we need, that is drop scissor > + * edges if the bounding box of the tri is fully inside that edge. > + */ > + if (setup->scissor_test) { > + /* why not just use draw_regions */ > + struct u_rect *scissor = &setup->scissors[viewport_index]; > + if (bbox.x0 < scissor->x0) nr_planes++; > + if (bbox.x1 > scissor->x1) nr_planes++; > + if (bbox.y0 < scissor->y0) nr_planes++; > + if (bbox.y1 > scissor->y1) nr_planes++; > + } > + > tri = lp_setup_alloc_triangle(scene, > key->num_inputs, > nr_planes, > @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup, > > /* Setup parameter interpolants: > */ > - setup->setup.variant->jit_function( v0, > - v1, > - v2, > - frontfacing, > - GET_A0(&tri->inputs), > - GET_DADX(&tri->inputs), > - GET_DADY(&tri->inputs) ); > + setup->setup.variant->jit_function(v0, v1, v2, > + frontfacing, > + GET_A0(&tri->inputs), > + GET_DADX(&tri->inputs), > + GET_DADY(&tri->inputs)); > > tri->inputs.frontfacing = frontfacing; > tri->inputs.disable = FALSE; > @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup, > > if (0) > lp_dump_setup_coef(&setup->setup.variant->key, > - (const float (*)[4])GET_A0(&tri->inputs), > - (const float (*)[4])GET_DADX(&tri->inputs), > - (const float (*)[4])GET_DADY(&tri->inputs)); > + (const float (*)[4])GET_A0(&tri->inputs), > + (const float (*)[4])GET_DADX(&tri->inputs), > + (const float (*)[4])GET_DADY(&tri->inputs)); > > plane = GET_PLANES(tri); > > @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup, > * Note that otherwise, the scissor planes only vary in 'C' value, > * and even then only on state-changes. Could alternatively store > * these planes elsewhere. > + * (Or only store the c value together with a bit indicating which > + * scissor edge this is, so rasterization would treat them differently > + * (easier to evaluate) to ordinary planes.) > */ > - if (nr_planes == 7) { > - const struct u_rect *scissor = &setup->scissors[viewport_index]; > - > - plane[3].dcdx = -1 << 8; > - plane[3].dcdy = 0; > - plane[3].c = (1-scissor->x0) << 8; > - plane[3].eo = 1 << 8; > - > - plane[4].dcdx = 1 << 8; > - plane[4].dcdy = 0; > - plane[4].c = (scissor->x1+1) << 8; > - plane[4].eo = 0; > - > - plane[5].dcdx = 0; > - plane[5].dcdy = 1 << 8; > - plane[5].c = (1-scissor->y0) << 8; > - plane[5].eo = 1 << 8; > - > - plane[6].dcdx = 0; > - plane[6].dcdy = -1 << 8; > - plane[6].c = (scissor->y1+1) << 8; > - plane[6].eo = 0; > + if (nr_planes > 3) { > + /* why not just use draw_regions */ > + struct u_rect *scissor = &setup->scissors[viewport_index]; > + unsigned scis_index = 3; > + > + if (bbox.x0 < scissor->x0) { > + plane[scis_index].dcdx = -1 << 8; > + plane[scis_index].dcdy = 0; > + plane[scis_index].c = (1-scissor->x0) << 8; > + plane[scis_index].eo = 1 << 8; > + scis_index++; > + } > + if (bbox.x1 > scissor->x1) { > + plane[scis_index].dcdx = 1 << 8; > + plane[scis_index].dcdy = 0; > + plane[scis_index].c = (scissor->x1+1) << 8; > + plane[scis_index].eo = 0 << 8; > + scis_index++; > + } > + if (bbox.y0 < scissor->y0) { > + plane[scis_index].dcdx = 0; > + plane[scis_index].dcdy = 1 << 8; > + plane[scis_index].c = (1-scissor->y0) << 8; > + plane[scis_index].eo = 1 << 8; > + scis_index++; > + } > + if (bbox.y1 > scissor->y1) { > + plane[scis_index].dcdx = 0; > + plane[scis_index].dcdy = -1 << 8; > + plane[scis_index].c = (scissor->y1+1) << 8; > + plane[scis_index].eo = 0; > + scis_index++; > + } > + assert(scis_index == nr_planes); > } > > return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, > viewport_index); > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev