Michal, did you update the 'C' version of translate for this new functionality? You can't just extend the fast path; the fallback/default mode needs to be updated as well.
Also, I'm sure it's not necessary to do a divide per vertex-element to achieve instancing. It can't be that hard to throw some more counters at the problem and do this with a couple of adds instead of a divide... Keith On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote: > Module: Mesa > Branch: instanced-arrays > Commit: 09c0287b84725098c0b365668231ddf00487c84c > URL: > http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c > > Author: Michal Krol <mic...@vmware.com> > Date: Wed Dec 30 14:23:12 2009 +0100 > > translate: Implement instancing for linear SSE run. > > --- > > src/gallium/auxiliary/translate/translate_sse.c | 154 > ++++++++++++++++++----- > 1 files changed, 120 insertions(+), 34 deletions(-) > > diff --git a/src/gallium/auxiliary/translate/translate_sse.c > b/src/gallium/auxiliary/translate/translate_sse.c > index edd0be1..ddfa4c6 100644 > --- a/src/gallium/auxiliary/translate/translate_sse.c > +++ b/src/gallium/auxiliary/translate/translate_sse.c > @@ -49,6 +49,7 @@ > typedef void (PIPE_CDECL *run_func)( struct translate *translate, > unsigned start, > unsigned count, > + unsigned instance_id, > void *output_buffer ); > > typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, > @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct translate > *translate, > struct translate_buffer { > const void *base_ptr; > unsigned stride; > - void *ptr; /* updated per vertex */ > +}; > + > +struct translate_buffer_varient { > + unsigned buffer_index; > + unsigned instance_divisor; > + void *ptr; /* updated either per vertex or per > instance */ > }; > > > @@ -81,6 +87,16 @@ struct translate_sse { > struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; > unsigned nr_buffers; > > + /* Multiple buffer varients can map to a single buffer. */ > + struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; > + unsigned nr_buffer_varients; > + > + /* Multiple elements can map to a single buffer varient. 
*/ > + unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; > + > + boolean use_instancing; > + unsigned instance_id; > + > run_func gen_run; > run_elts_func gen_run_elts; > > @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p, > { > unsigned i; > if (linear) { > - for (i = 0; i < p->nr_buffers; i++) { > + struct x86_reg instance_id = x86_make_disp(p->machine_EDX, > + get_offset(p, > &p->instance_id)); > + > + for (i = 0; i < p->nr_buffer_varients; i++) { > + struct translate_buffer_varient *varient = &p->buffer_varient[i]; > + struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; > struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, > - get_offset(p, > &p->buffer[i].stride)); > + get_offset(p, > &buffer->stride)); > struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, > - get_offset(p, > &p->buffer[i].ptr)); > + get_offset(p, > &varient->ptr)); > struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, > - get_offset(p, > &p->buffer[i].base_ptr)); > + get_offset(p, > &buffer->base_ptr)); > struct x86_reg elt = p->idx_EBX; > - struct x86_reg tmp = p->tmp_EAX; > - > + struct x86_reg tmp_EAX = p->tmp_EAX; > > /* Calculate pointer to first attrib: > + * base_ptr + stride * index, where index depends on instance > divisor > */ > - x86_mov(p->func, tmp, buf_stride); > - x86_imul(p->func, tmp, elt); > - x86_add(p->func, tmp, buf_base_ptr); > + if (varient->instance_divisor) { > + /* Our index is instance ID divided by instance divisor. > + */ > + x86_mov(p->func, tmp_EAX, instance_id); > + > + if (varient->instance_divisor != 1) { > + struct x86_reg tmp_EDX = p->machine_EDX; > + struct x86_reg tmp_ECX = p->outbuf_ECX; > + > + /* TODO: Add x86_shr() to rtasm and use it whenever > + * instance divisor is power of two. 
> + */ > + > + x86_push(p->func, tmp_EDX); > + x86_push(p->func, tmp_ECX); > + x86_xor(p->func, tmp_EDX, tmp_EDX); > + x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); > + x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ > + x86_pop(p->func, tmp_ECX); > + x86_pop(p->func, tmp_EDX); > + } > + } else { > + x86_mov(p->func, tmp_EAX, elt); > + } > + x86_imul(p->func, tmp_EAX, buf_stride); > + x86_add(p->func, tmp_EAX, buf_base_ptr); > > > /* In the linear case, keep the buffer pointer instead of the > * index number. > */ > - if (p->nr_buffers == 1) > - x86_mov( p->func, elt, tmp ); > + if (p->nr_buffer_varients == 1) > + x86_mov(p->func, elt, tmp_EAX); > else > - x86_mov( p->func, buf_ptr, tmp ); > + x86_mov(p->func, buf_ptr, tmp_EAX); > } > } > > @@ -394,31 +438,32 @@ static boolean init_inputs( struct translate_sse *p, > > static struct x86_reg get_buffer_ptr( struct translate_sse *p, > boolean linear, > - unsigned buf_idx, > + unsigned var_idx, > struct x86_reg elt ) > { > - if (linear && p->nr_buffers == 1) { > + if (linear && p->nr_buffer_varients == 1) { > return p->idx_EBX; > } > else if (linear) { > struct x86_reg ptr = p->tmp_EAX; > struct x86_reg buf_ptr = > x86_make_disp(p->machine_EDX, > - get_offset(p, &p->buffer[buf_idx].ptr)); > + get_offset(p, &p->buffer_varient[var_idx].ptr)); > > x86_mov(p->func, ptr, buf_ptr); > return ptr; > } > else { > struct x86_reg ptr = p->tmp_EAX; > + const struct translate_buffer_varient *varient = > &p->buffer_varient[var_idx]; > > struct x86_reg buf_stride = > x86_make_disp(p->machine_EDX, > - get_offset(p, &p->buffer[buf_idx].stride)); > + get_offset(p, > &p->buffer[varient->buffer_index].stride)); > > struct x86_reg buf_base_ptr = > x86_make_disp(p->machine_EDX, > - get_offset(p, &p->buffer[buf_idx].base_ptr)); > + get_offset(p, > &p->buffer[varient->buffer_index].base_ptr)); > > > > @@ -436,28 +481,33 @@ static struct x86_reg get_buffer_ptr( struct > translate_sse *p, > static boolean incr_inputs( 
struct translate_sse *p, > boolean linear ) > { > - if (linear && p->nr_buffers == 1) { > + if (linear && p->nr_buffer_varients == 1) { > struct x86_reg stride = x86_make_disp(p->machine_EDX, > get_offset(p, > &p->buffer[0].stride)); > > - x86_add(p->func, p->idx_EBX, stride); > - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); > + if (p->buffer_varient[0].instance_divisor == 0) { > + x86_add(p->func, p->idx_EBX, stride); > + sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); > + } > } > else if (linear) { > unsigned i; > > /* Is this worthwhile?? > */ > - for (i = 0; i < p->nr_buffers; i++) { > + for (i = 0; i < p->nr_buffer_varients; i++) { > + struct translate_buffer_varient *varient = &p->buffer_varient[i]; > struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, > - get_offset(p, > &p->buffer[i].ptr)); > + get_offset(p, > &varient->ptr)); > struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, > - get_offset(p, > &p->buffer[i].stride)); > + get_offset(p, > &p->buffer[varient->buffer_index].stride)); > > - x86_mov(p->func, p->tmp_EAX, buf_ptr); > - x86_add(p->func, p->tmp_EAX, buf_stride); > - if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, > 192)); > - x86_mov(p->func, buf_ptr, p->tmp_EAX); > + if (varient->instance_divisor == 0) { > + x86_mov(p->func, p->tmp_EAX, buf_ptr); > + x86_add(p->func, p->tmp_EAX, buf_stride); > + if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, > 192)); > + x86_mov(p->func, buf_ptr, p->tmp_EAX); > + } > } > } > else { > @@ -514,7 +564,18 @@ static boolean build_vertex_emit( struct translate_sse > *p, > x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); > x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); > x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); > - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4)); > + x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); > + > + /* Load instance ID. 
> + */ > + if (p->use_instancing) { > + x86_mov(p->func, > + p->tmp_EAX, > + x86_fn_arg(p->func, 4)); > + x86_mov(p->func, > + x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), > + p->tmp_EAX); > + } > > /* Get vertex count, compare to zero > */ > @@ -531,17 +592,18 @@ static boolean build_vertex_emit( struct translate_sse > *p, > label = x86_get_label(p->func); > { > struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); > - int last_vb = -1; > + int last_varient = -1; > struct x86_reg vb; > > for (j = 0; j < p->translate.key.nr_elements; j++) { > const struct translate_element *a = &p->translate.key.element[j]; > + unsigned varient = p->element_to_buffer_varient[j]; > > /* Figure out source pointer address: > */ > - if (a->input_buffer != last_vb) { > - last_vb = a->input_buffer; > - vb = get_buffer_ptr(p, linear, a->input_buffer, elt); > + if (varient != last_varient) { > + last_varient = varient; > + vb = get_buffer_ptr(p, linear, varient, elt); > } > > if (!translate_attr( p, a, > @@ -645,6 +707,7 @@ static void PIPE_CDECL translate_sse_run( struct > translate *translate, > p->gen_run( translate, > start, > count, > + instance_id, > output_buffer ); > } > > @@ -667,9 +730,32 @@ struct translate *translate_sse2_create( const struct > translate_key *key ) > p->translate.run_elts = translate_sse_run_elts; > p->translate.run = translate_sse_run; > > - for (i = 0; i < key->nr_elements; i++) > + for (i = 0; i < key->nr_elements; i++) { > + unsigned j; > + > p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 > ); > > + if (key->element[i].instance_divisor) { > + p->use_instancing = TRUE; > + } > + > + /* > + * Map vertex element to vertex buffer varient. 
> + */ > + for (j = 0; j < p->nr_buffer_varients; j++) { > + if (p->buffer_varient[j].buffer_index == > key->element[i].input_buffer && > + p->buffer_varient[j].instance_divisor == > key->element[i].instance_divisor) { > + break; > + } > + } > + if (j == p->nr_buffer_varients) { > + p->buffer_varient[j].buffer_index = key->element[i].input_buffer; > + p->buffer_varient[j].instance_divisor = > key->element[i].instance_divisor; > + p->nr_buffer_varients++; > + } > + p->element_to_buffer_varient[i] = j; > + } > + > if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); > > if (!build_vertex_emit(p, &p->linear_func, TRUE)) > > _______________________________________________ > mesa-commit mailing list > mesa-com...@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-commit ------------------------------------------------------------------------------ This SF.Net email is sponsored by the Verizon Developer Community Take advantage of Verizon's best-in-class app development support A streamlined, 14 day to market process makes app distribution fast and easy Join now and get one step closer to millions of Verizon customers http://p.sf.net/sfu/verizon-dev2dev _______________________________________________ Mesa3d-dev mailing list Mesa3d-dev@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/mesa3d-dev