On Wed, 2009-12-30 at 07:37 -0800, michal wrote: > Keith Whitwell wrote on 2009-12-30 16:22: > > Michal, > > > > Did you update the 'C' version of translate for this new functionality? > > You can't just extend the fast path - the fallback/default mode needs to > > be updated as well. > > > > > Yes, I did that in the previous commit.
OK. I was confused as your other changes to translate are labelled that way in the commit message, while this was buried inside what looked like a softpipe change. > > Also, I'm sure it's not necessary to do a divide per vertex-element to > > achieve instancing. It can't be that hard to throw some more counters > > at the problem and do this with a couple of adds instead of a divide... > > > > > The division is done once per instance, not per every vertex attrib. OK, I misread that also... > Are you serious about optimising such low-profile things? No, once per invocation is fine. Sorry for the confusion. Keith > > Keith > > > > On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote: > > > >> Module: Mesa > >> Branch: instanced-arrays > >> Commit: 09c0287b84725098c0b365668231ddf00487c84c > >> URL: > >> http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c > >> > >> Author: Michal Krol <mic...@vmware.com> > >> Date: Wed Dec 30 14:23:12 2009 +0100 > >> > >> translate: Implement instancing for linear SSE run. 
> >> > >> --- > >> > >> src/gallium/auxiliary/translate/translate_sse.c | 154 > >> ++++++++++++++++++----- > >> 1 files changed, 120 insertions(+), 34 deletions(-) > >> > >> diff --git a/src/gallium/auxiliary/translate/translate_sse.c > >> b/src/gallium/auxiliary/translate/translate_sse.c > >> index edd0be1..ddfa4c6 100644 > >> --- a/src/gallium/auxiliary/translate/translate_sse.c > >> +++ b/src/gallium/auxiliary/translate/translate_sse.c > >> @@ -49,6 +49,7 @@ > >> typedef void (PIPE_CDECL *run_func)( struct translate *translate, > >> unsigned start, > >> unsigned count, > >> + unsigned instance_id, > >> void *output_buffer ); > >> > >> typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, > >> @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct > >> translate *translate, > >> struct translate_buffer { > >> const void *base_ptr; > >> unsigned stride; > >> - void *ptr; /* updated per vertex */ > >> +}; > >> + > >> +struct translate_buffer_varient { > >> + unsigned buffer_index; > >> + unsigned instance_divisor; > >> + void *ptr; /* updated either per vertex or per > >> instance */ > >> }; > >> > >> > >> @@ -81,6 +87,16 @@ struct translate_sse { > >> struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; > >> unsigned nr_buffers; > >> > >> + /* Multiple buffer varients can map to a single buffer. */ > >> + struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; > >> + unsigned nr_buffer_varients; > >> + > >> + /* Multiple elements can map to a single buffer varient. 
*/ > >> + unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; > >> + > >> + boolean use_instancing; > >> + unsigned instance_id; > >> + > >> run_func gen_run; > >> run_elts_func gen_run_elts; > >> > >> @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p, > >> { > >> unsigned i; > >> if (linear) { > >> - for (i = 0; i < p->nr_buffers; i++) { > >> + struct x86_reg instance_id = x86_make_disp(p->machine_EDX, > >> + get_offset(p, > >> &p->instance_id)); > >> + > >> + for (i = 0; i < p->nr_buffer_varients; i++) { > >> + struct translate_buffer_varient *varient = &p->buffer_varient[i]; > >> + struct translate_buffer *buffer = > >> &p->buffer[varient->buffer_index]; > >> struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, > >> - get_offset(p, > >> &p->buffer[i].stride)); > >> + get_offset(p, > >> &buffer->stride)); > >> struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, > >> - get_offset(p, > >> &p->buffer[i].ptr)); > >> + get_offset(p, > >> &varient->ptr)); > >> struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, > >> - get_offset(p, > >> &p->buffer[i].base_ptr)); > >> + get_offset(p, > >> &buffer->base_ptr)); > >> struct x86_reg elt = p->idx_EBX; > >> - struct x86_reg tmp = p->tmp_EAX; > >> - > >> + struct x86_reg tmp_EAX = p->tmp_EAX; > >> > >> /* Calculate pointer to first attrib: > >> + * base_ptr + stride * index, where index depends on instance > >> divisor > >> */ > >> - x86_mov(p->func, tmp, buf_stride); > >> - x86_imul(p->func, tmp, elt); > >> - x86_add(p->func, tmp, buf_base_ptr); > >> + if (varient->instance_divisor) { > >> + /* Our index is instance ID divided by instance divisor. > >> + */ > >> + x86_mov(p->func, tmp_EAX, instance_id); > >> + > >> + if (varient->instance_divisor != 1) { > >> + struct x86_reg tmp_EDX = p->machine_EDX; > >> + struct x86_reg tmp_ECX = p->outbuf_ECX; > >> + > >> + /* TODO: Add x86_shr() to rtasm and use it whenever > >> + * instance divisor is power of two. 
> >> + */ > >> + > >> + x86_push(p->func, tmp_EDX); > >> + x86_push(p->func, tmp_ECX); > >> + x86_xor(p->func, tmp_EDX, tmp_EDX); > >> + x86_mov_reg_imm(p->func, tmp_ECX, > >> varient->instance_divisor); > >> + x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ > >> + x86_pop(p->func, tmp_ECX); > >> + x86_pop(p->func, tmp_EDX); > >> + } > >> + } else { > >> + x86_mov(p->func, tmp_EAX, elt); > >> + } > >> + x86_imul(p->func, tmp_EAX, buf_stride); > >> + x86_add(p->func, tmp_EAX, buf_base_ptr); > >> > >> > >> /* In the linear case, keep the buffer pointer instead of the > >> * index number. > >> */ > >> - if (p->nr_buffers == 1) > >> - x86_mov( p->func, elt, tmp ); > >> + if (p->nr_buffer_varients == 1) > >> + x86_mov(p->func, elt, tmp_EAX); > >> else > >> - x86_mov( p->func, buf_ptr, tmp ); > >> + x86_mov(p->func, buf_ptr, tmp_EAX); > >> } > >> } > >> > >> @@ -394,31 +438,32 @@ static boolean init_inputs( struct translate_sse *p, > >> > >> static struct x86_reg get_buffer_ptr( struct translate_sse *p, > >> boolean linear, > >> - unsigned buf_idx, > >> + unsigned var_idx, > >> struct x86_reg elt ) > >> { > >> - if (linear && p->nr_buffers == 1) { > >> + if (linear && p->nr_buffer_varients == 1) { > >> return p->idx_EBX; > >> } > >> else if (linear) { > >> struct x86_reg ptr = p->tmp_EAX; > >> struct x86_reg buf_ptr = > >> x86_make_disp(p->machine_EDX, > >> - get_offset(p, &p->buffer[buf_idx].ptr)); > >> + get_offset(p, &p->buffer_varient[var_idx].ptr)); > >> > >> x86_mov(p->func, ptr, buf_ptr); > >> return ptr; > >> } > >> else { > >> struct x86_reg ptr = p->tmp_EAX; > >> + const struct translate_buffer_varient *varient = > >> &p->buffer_varient[var_idx]; > >> > >> struct x86_reg buf_stride = > >> x86_make_disp(p->machine_EDX, > >> - get_offset(p, &p->buffer[buf_idx].stride)); > >> + get_offset(p, > >> &p->buffer[varient->buffer_index].stride)); > >> > >> struct x86_reg buf_base_ptr = > >> x86_make_disp(p->machine_EDX, > >> - get_offset(p, 
&p->buffer[buf_idx].base_ptr)); > >> + get_offset(p, > >> &p->buffer[varient->buffer_index].base_ptr)); > >> > >> > >> > >> @@ -436,28 +481,33 @@ static struct x86_reg get_buffer_ptr( struct > >> translate_sse *p, > >> static boolean incr_inputs( struct translate_sse *p, > >> boolean linear ) > >> { > >> - if (linear && p->nr_buffers == 1) { > >> + if (linear && p->nr_buffer_varients == 1) { > >> struct x86_reg stride = x86_make_disp(p->machine_EDX, > >> get_offset(p, > >> &p->buffer[0].stride)); > >> > >> - x86_add(p->func, p->idx_EBX, stride); > >> - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); > >> + if (p->buffer_varient[0].instance_divisor == 0) { > >> + x86_add(p->func, p->idx_EBX, stride); > >> + sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); > >> + } > >> } > >> else if (linear) { > >> unsigned i; > >> > >> /* Is this worthwhile?? > >> */ > >> - for (i = 0; i < p->nr_buffers; i++) { > >> + for (i = 0; i < p->nr_buffer_varients; i++) { > >> + struct translate_buffer_varient *varient = &p->buffer_varient[i]; > >> struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, > >> - get_offset(p, > >> &p->buffer[i].ptr)); > >> + get_offset(p, > >> &varient->ptr)); > >> struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, > >> - get_offset(p, > >> &p->buffer[i].stride)); > >> + get_offset(p, > >> &p->buffer[varient->buffer_index].stride)); > >> > >> - x86_mov(p->func, p->tmp_EAX, buf_ptr); > >> - x86_add(p->func, p->tmp_EAX, buf_stride); > >> - if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, > >> 192)); > >> - x86_mov(p->func, buf_ptr, p->tmp_EAX); > >> + if (varient->instance_divisor == 0) { > >> + x86_mov(p->func, p->tmp_EAX, buf_ptr); > >> + x86_add(p->func, p->tmp_EAX, buf_stride); > >> + if (i == 0) sse_prefetchnta(p->func, > >> x86_make_disp(p->tmp_EAX, 192)); > >> + x86_mov(p->func, buf_ptr, p->tmp_EAX); > >> + } > >> } > >> } > >> else { > >> @@ -514,7 +564,18 @@ static boolean build_vertex_emit( struct > >> 
translate_sse *p, > >> x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); > >> x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); > >> x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); > >> - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4)); > >> + x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); > >> + > >> + /* Load instance ID. > >> + */ > >> + if (p->use_instancing) { > >> + x86_mov(p->func, > >> + p->tmp_EAX, > >> + x86_fn_arg(p->func, 4)); > >> + x86_mov(p->func, > >> + x86_make_disp(p->machine_EDX, get_offset(p, > >> &p->instance_id)), > >> + p->tmp_EAX); > >> + } > >> > >> /* Get vertex count, compare to zero > >> */ > >> @@ -531,17 +592,18 @@ static boolean build_vertex_emit( struct > >> translate_sse *p, > >> label = x86_get_label(p->func); > >> { > >> struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); > >> - int last_vb = -1; > >> + int last_varient = -1; > >> struct x86_reg vb; > >> > >> for (j = 0; j < p->translate.key.nr_elements; j++) { > >> const struct translate_element *a = &p->translate.key.element[j]; > >> + unsigned varient = p->element_to_buffer_varient[j]; > >> > >> /* Figure out source pointer address: > >> */ > >> - if (a->input_buffer != last_vb) { > >> - last_vb = a->input_buffer; > >> - vb = get_buffer_ptr(p, linear, a->input_buffer, elt); > >> + if (varient != last_varient) { > >> + last_varient = varient; > >> + vb = get_buffer_ptr(p, linear, varient, elt); > >> } > >> > >> if (!translate_attr( p, a, > >> @@ -645,6 +707,7 @@ static void PIPE_CDECL translate_sse_run( struct > >> translate *translate, > >> p->gen_run( translate, > >> start, > >> count, > >> + instance_id, > >> output_buffer ); > >> } > >> > >> @@ -667,9 +730,32 @@ struct translate *translate_sse2_create( const struct > >> translate_key *key ) > >> p->translate.run_elts = translate_sse_run_elts; > >> p->translate.run = translate_sse_run; > >> > >> - for (i = 0; i < key->nr_elements; i++) > >> + for (i = 0; i < 
key->nr_elements; i++) { > >> + unsigned j; > >> + > >> p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + > >> 1 ); > >> > >> + if (key->element[i].instance_divisor) { > >> + p->use_instancing = TRUE; > >> + } > >> + > >> + /* > >> + * Map vertex element to vertex buffer varient. > >> + */ > >> + for (j = 0; j < p->nr_buffer_varients; j++) { > >> + if (p->buffer_varient[j].buffer_index == > >> key->element[i].input_buffer && > >> + p->buffer_varient[j].instance_divisor == > >> key->element[i].instance_divisor) { > >> + break; > >> + } > >> + } > >> + if (j == p->nr_buffer_varients) { > >> + p->buffer_varient[j].buffer_index = key->element[i].input_buffer; > >> + p->buffer_varient[j].instance_divisor = > >> key->element[i].instance_divisor; > >> + p->nr_buffer_varients++; > >> + } > >> + p->element_to_buffer_varient[i] = j; > >> + } > >> + > >> if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); > >> > >> if (!build_vertex_emit(p, &p->linear_func, TRUE)) > >> > >> _______________________________________________ > >> mesa-commit mailing list > >> mesa-com...@lists.freedesktop.org > >> http://lists.freedesktop.org/mailman/listinfo/mesa-commit > >> > > > > > > > ------------------------------------------------------------------------------ This SF.Net email is sponsored by the Verizon Developer Community Take advantage of Verizon's best-in-class app development support A streamlined, 14 day to market process makes app distribution fast and easy Join now and get one step closer to millions of Verizon customers http://p.sf.net/sfu/verizon-dev2dev _______________________________________________ Mesa3d-dev mailing list Mesa3d-dev@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/mesa3d-dev