Am 02.02.2016 um 13:32 schrieb Jose Fonseca: > On 01/02/16 01:00, srol...@vmware.com wrote: >> From: Roland Scheidegger <srol...@vmware.com> >> >> If the tri is fully inside a scissor edge (or rather, we just use the >> bounding box of the tri for the comparison), then we can drop these >> additional scissor "planes" early. We do not even need to allocate >> space for them in the tri. >> The math actually appears to be slightly iffy due to bounding boxes >> being rounded, but it doesn't matter in the end. >> Those scissor rects are costly - the 4 planes from the scissor are >> already more expensive to calculate than the 3 planes from the tri >> itself, >> and it also prevents us from using the specialized raster code for small >> tris. >> This helps openarena performance by about 8% or so. Of course, it helps >> there that while openarena often enables scissoring (and even moves the >> scissor rect around) I have not seen a single tri actually hit the >> scissor rect, ever. >> >> v2: drop individual scissor edges, and do it earlier, not even allocating >> space for them. >> --- >> src/gallium/drivers/llvmpipe/lp_rast_tri.c | 16 +++++ >> src/gallium/drivers/llvmpipe/lp_setup_line.c | 81 >> ++++++++++++++--------- >> src/gallium/drivers/llvmpipe/lp_setup_tri.c | 98 >> +++++++++++++++++----------- >> 3 files changed, 126 insertions(+), 69 deletions(-) >> >> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c >> b/src/gallium/drivers/llvmpipe/lp_rast_tri.c >> index f4a2f02..bf27900 100644 >> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c >> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c >> @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct >> lp_rasterizer_task *task, >> */ >> dcdx = _mm_sub_epi32(zero, dcdx); >> >> +#if 0 >> + { >> + __m128i lbits, fixup, xy, dcdxdy, dcdx16; >> + lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1)); >> + fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128()); >> + c = _mm_srai_epi32(c, 8); >> + c = _mm_add_epi32(c, fixup); >> + xy = _mm_set1_epi32(x | y << 16); >> + dcdx = _mm_srai_epi32(dcdx, 8); >> + dcdy = _mm_srai_epi32(dcdy, 8); >> + dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx); >> + dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16)); >> + c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy)); >> + } >> +#else >> c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); >> c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); >> >> /* Adjust so we can just check the sign bit (< 0 comparison), >> instead of having to do a less efficient <= 0 comparison */ >> c = _mm_sub_epi32(c, _mm_set1_epi32(1)); >> +#endif >> >> dcdx2 = _mm_add_epi32(dcdx, dcdx); >> dcdx3 = _mm_add_epi32(dcdx2, dcdx); >> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c >> b/src/gallium/drivers/llvmpipe/lp_setup_line.c >> index f425825..3ec9ac4 100644 >> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c >> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c >> @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup, >> layer = MIN2(layer, scene->fb_max_layer); >> } >> >> - if (setup->scissor_test) { >> - nr_planes = 8; >> - } >> - else { >> - nr_planes = 4; >> - } >> - >> dx = v1[0][0] - v2[0][0]; >> dy = v1[0][1] - v2[0][1]; >> area = (dx * dx + dy * dy); >> @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup, >> bbox.x0 = MAX2(bbox.x0, 0); >> bbox.y0 = MAX2(bbox.y0, 0); >> >> + nr_planes = 4; >> + /* >> + * Determine how many scissor planes we need, that is drop scissor >> + * edges if the bounding box of the tri is fully inside that edge. >> + */ >> + if (setup->scissor_test) { >> + /* why not just use draw_regions */ >> + struct u_rect *scissor = &setup->scissors[viewport_index]; > > > Please use 4 bools: > > bool scissor_left = bbox.x0 < scissor->x0; > > and reused them below. > > I'm concern we could use >= for some of these -- though it might depend > on the current rasterization rules. I think it should be ok. We already have adjusted the bounding box for the tri depending on rasterization rules so this shouldn't matter anymore. But yes, the math is a bit non-obvious with the rounded bounding box of the tri (I'd guess with msaa we'd definitely need something better, but that would be true elsewhere too).
> > Anyway, using bools ensure these conditions stay in sync with the code > bloew. I already submitted this yesterday after incorporating Brian's feedback. Those suggestions were actually incompatible with yours, you can have either faster or nicer code here ;-). > Maybe an even better alternative is have a "needed_scissor_planes" > helper function that gets used every where. I'm going to push something along these lines, making it nicer ;-) Roland > > > >> + if (bbox.x0 < scissor->x0) nr_planes++; >> + if (bbox.x1 > scissor->x1) nr_planes++; >> + if (bbox.y0 < scissor->y0) nr_planes++; >> + if (bbox.y1 > scissor->y1) nr_planes++; >> + } >> + >> line = lp_setup_alloc_triangle(scene, >> key->num_inputs, >> nr_planes, >> @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup, >> * Note that otherwise, the scissor planes only vary in 'C' value, >> * and even then only on state-changes. Could alternatively store >> * these planes elsewhere. >> + * (Or only store the c value together with a bit indicating which >> + * scissor edge this is, so rasterization would treat them >> differently >> + * (easier to evaluate) to ordinary planes.) >> */ >> - if (nr_planes == 8) { >> - const struct u_rect *scissor = >> - &setup->scissors[viewport_index]; >> - >> - plane[4].dcdx = -1 << 8; >> - plane[4].dcdy = 0; >> - plane[4].c = (1-scissor->x0) << 8; >> - plane[4].eo = 1 << 8; >> - >> - plane[5].dcdx = 1 << 8; >> - plane[5].dcdy = 0; >> - plane[5].c = (scissor->x1+1) << 8; >> - plane[5].eo = 0; >> - >> - plane[6].dcdx = 0; >> - plane[6].dcdy = 1 << 8; >> - plane[6].c = (1-scissor->y0) << 8; >> - plane[6].eo = 1 << 8; >> - >> - plane[7].dcdx = 0; >> - plane[7].dcdy = -1 << 8; >> - plane[7].c = (scissor->y1+1) << 8; >> - plane[7].eo = 0; >> + if (nr_planes > 4) { >> + /* why not just use draw_regions */ >> + struct u_rect *scissor = &setup->scissors[viewport_index]; >> + unsigned scis_index = 4; >> + >> + if (bbox.x0 < scissor->x0) { > > And use scissor_left. > >> + plane[scis_index].dcdx = -1 << 8; >> + plane[scis_index].dcdy = 0; >> + plane[scis_index].c = (1-scissor->x0) << 8; >> + plane[scis_index].eo = 1 << 8; >> + scis_index++; >> + } >> + if (bbox.x1 > scissor->x1) { >> + plane[scis_index].dcdx = 1 << 8; >> + plane[scis_index].dcdy = 0; >> + plane[scis_index].c = (scissor->x1+1) << 8; >> + plane[scis_index].eo = 0 << 8; >> + scis_index++; >> + } >> + if (bbox.y0 < scissor->y0) { >> + plane[scis_index].dcdx = 0; >> + plane[scis_index].dcdy = 1 << 8; >> + plane[scis_index].c = (1-scissor->y0) << 8; >> + plane[scis_index].eo = 1 << 8; >> + scis_index++; >> + } >> + if (bbox.y1 > scissor->y1) { >> + plane[scis_index].dcdx = 0; >> + plane[scis_index].dcdy = -1 << 8; >> + plane[scis_index].c = (scissor->y1+1) << 8; >> + plane[scis_index].eo = 0; >> + scis_index++; >> + } >> + assert(scis_index == nr_planes); >> } >> >> return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, >> viewport_index); >> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> b/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> index 1e3a750..4e9acce 100644 >> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup, >> layer = MIN2(layer, scene->fb_max_layer); >> } >> >> - if (setup->scissor_test) { >> - nr_planes = 7; >> - } >> - else { >> - nr_planes = 3; >> - } >> - >> /* Bounding rectangle (in pixels) */ >> { >> /* Yes this is necessary to accurately calculate bounding boxes >> @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup, >> bbox.x0 = MAX2(bbox.x0, 0); >> bbox.y0 = MAX2(bbox.y0, 0); >> >> + nr_planes = 3; >> + /* >> + * Determine how many scissor planes we need, that is drop scissor >> + * edges if the bounding box of the tri is fully inside that edge. >> + */ >> + if (setup->scissor_test) { >> + /* why not just use draw_regions */ >> + struct u_rect *scissor = &setup->scissors[viewport_index]; >> + if (bbox.x0 < scissor->x0) nr_planes++; >> + if (bbox.x1 > scissor->x1) nr_planes++; >> + if (bbox.y0 < scissor->y0) nr_planes++; >> + if (bbox.y1 > scissor->y1) nr_planes++; >> + } >> + >> tri = lp_setup_alloc_triangle(scene, >> key->num_inputs, >> nr_planes, >> @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup, >> >> /* Setup parameter interpolants: >> */ >> - setup->setup.variant->jit_function( v0, >> - v1, >> - v2, >> - frontfacing, >> - GET_A0(&tri->inputs), >> - GET_DADX(&tri->inputs), >> - GET_DADY(&tri->inputs) ); >> + setup->setup.variant->jit_function(v0, v1, v2, >> + frontfacing, >> + GET_A0(&tri->inputs), >> + GET_DADX(&tri->inputs), >> + GET_DADY(&tri->inputs)); >> >> tri->inputs.frontfacing = frontfacing; >> tri->inputs.disable = FALSE; >> @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup, >> >> if (0) >> lp_dump_setup_coef(&setup->setup.variant->key, >> - (const float (*)[4])GET_A0(&tri->inputs), >> - (const float (*)[4])GET_DADX(&tri->inputs), >> - (const float (*)[4])GET_DADY(&tri->inputs)); >> + (const float (*)[4])GET_A0(&tri->inputs), >> + (const float (*)[4])GET_DADX(&tri->inputs), >> + (const float (*)[4])GET_DADY(&tri->inputs)); >> >> plane = GET_PLANES(tri); >> >> @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup, >> * Note that otherwise, the scissor planes only vary in 'C' value, >> * and even then only on state-changes. Could alternatively store >> * these planes elsewhere. >> + * (Or only store the c value together with a bit indicating which >> + * scissor edge this is, so rasterization would treat them >> differently >> + * (easier to evaluate) to ordinary planes.) >> */ >> - if (nr_planes == 7) { >> - const struct u_rect *scissor = &setup->scissors[viewport_index]; >> - >> - plane[3].dcdx = -1 << 8; >> - plane[3].dcdy = 0; >> - plane[3].c = (1-scissor->x0) << 8; >> - plane[3].eo = 1 << 8; >> - >> - plane[4].dcdx = 1 << 8; >> - plane[4].dcdy = 0; >> - plane[4].c = (scissor->x1+1) << 8; >> - plane[4].eo = 0; >> - >> - plane[5].dcdx = 0; >> - plane[5].dcdy = 1 << 8; >> - plane[5].c = (1-scissor->y0) << 8; >> - plane[5].eo = 1 << 8; >> - >> - plane[6].dcdx = 0; >> - plane[6].dcdy = -1 << 8; >> - plane[6].c = (scissor->y1+1) << 8; >> - plane[6].eo = 0; >> + if (nr_planes > 3) { >> + /* why not just use draw_regions */ >> + struct u_rect *scissor = &setup->scissors[viewport_index]; >> + unsigned scis_index = 3; >> + >> + if (bbox.x0 < scissor->x0) { >> + plane[scis_index].dcdx = -1 << 8; >> + plane[scis_index].dcdy = 0; >> + plane[scis_index].c = (1-scissor->x0) << 8; >> + plane[scis_index].eo = 1 << 8; >> + scis_index++; >> + } >> + if (bbox.x1 > scissor->x1) { >> + plane[scis_index].dcdx = 1 << 8; >> + plane[scis_index].dcdy = 0; >> + plane[scis_index].c = (scissor->x1+1) << 8; >> + plane[scis_index].eo = 0 << 8; >> + scis_index++; >> + } >> + if (bbox.y0 < scissor->y0) { >> + plane[scis_index].dcdx = 0; >> + plane[scis_index].dcdy = 1 << 8; >> + plane[scis_index].c = (1-scissor->y0) << 8; >> + plane[scis_index].eo = 1 << 8; >> + scis_index++; >> + } >> + if (bbox.y1 > scissor->y1) { >> + plane[scis_index].dcdx = 0; >> + plane[scis_index].dcdy = -1 << 8; >> + plane[scis_index].c = (scissor->y1+1) << 8; >> + plane[scis_index].eo = 0; >> + scis_index++; >> + } >> + assert(scis_index == nr_planes); >> } >> >> return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, >> viewport_index); >> > > > > Otherwise looks good. > > The other patches of this series looks good too, and are > > Reviwed-by: Jose Fonseca <jfons...@vmware.com > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev