Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run.
Michal, Did you update the 'C' version of translate for this new functionality? You can't just extend the fast path - the fallback/default mode needs to be updated as well. Also, I'm sure it's not necessary to do a divide per vertex-element to achieve instancing. It can't be that hard to throw some more counters at the problem and do this with a couple of adds instead of a divide... Keith On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote: Module: Mesa Branch: instanced-arrays Commit: 09c0287b84725098c0b365668231ddf00487c84c URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c Author: Michal Krol mic...@vmware.com Date: Wed Dec 30 14:23:12 2009 +0100 translate: Implement instancing for linear SSE run. --- src/gallium/auxiliary/translate/translate_sse.c | 154 ++- 1 files changed, 120 insertions(+), 34 deletions(-) diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index edd0be1..ddfa4c6 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -49,6 +49,7 @@ typedef void (PIPE_CDECL *run_func)( struct translate *translate, unsigned start, unsigned count, + unsigned instance_id, void *output_buffer ); typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, struct translate_buffer { const void *base_ptr; unsigned stride; - void *ptr; /* updated per vertex */ +}; + +struct translate_buffer_varient { + unsigned buffer_index; + unsigned instance_divisor; + void *ptr;/* updated either per vertex or per instance */ }; @@ -81,6 +87,16 @@ struct translate_sse { struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; + /* Multiple buffer varients can map to a single buffer. 
*/ + struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; + unsigned nr_buffer_varients; + + /* Multiple elements can map to a single buffer varient. */ + unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; + + boolean use_instancing; + unsigned instance_id; + run_func gen_run; run_elts_func gen_run_elts; @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p, { unsigned i; if (linear) { - for (i = 0; i < p->nr_buffers; i++) { + struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + get_offset(p, &p->instance_id)); + + for (i = 0; i < p->nr_buffer_varients; i++) { + struct translate_buffer_varient *varient = &p->buffer_varient[i]; + struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].stride)); + get_offset(p, &buffer->stride)); struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].ptr)); + get_offset(p, &varient->ptr)); struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].base_ptr)); + get_offset(p, &buffer->base_ptr)); struct x86_reg elt = p->idx_EBX; - struct x86_reg tmp = p->tmp_EAX; - + struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: + * base_ptr + stride * index, where index depends on instance divisor */ - x86_mov(p->func, tmp, buf_stride); - x86_imul(p->func, tmp, elt); - x86_add(p->func, tmp, buf_base_ptr); + if (varient->instance_divisor) { +/* Our index is instance ID divided by instance divisor. + */ +x86_mov(p->func, tmp_EAX, instance_id); + +if (varient->instance_divisor != 1) { + struct x86_reg tmp_EDX = p->machine_EDX; + struct x86_reg tmp_ECX = p->outbuf_ECX; + + /* TODO: Add x86_shr() to rtasm and use it whenever +* instance divisor is power of two. +*/ + + x86_push(p->func, tmp_EDX); + x86_push(p->func, tmp_ECX); + x86_xor(p->func, tmp_EDX, tmp_EDX); + x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); + x86_div(p->func,
Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run.
Keith Whitwell wrote on 2009-12-30 16:22: Michal, Did you update the 'C' version of translate for this new functionality? You can't just extend the fast path - the fallback/default mode needs to be updated as well. Yes, I did that in the previous commit. Also, I'm sure it's not necessary to do a divide per vertex-element to achieve instancing. It can't be that hard to throw some more counters at the problem and do this with a couple of adds instead of a divide... The division is done once per instance, not per every vertex attrib. Are you serious about optimising such low-profile things? Keith On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote: Module: Mesa Branch: instanced-arrays Commit: 09c0287b84725098c0b365668231ddf00487c84c URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c Author: Michal Krol mic...@vmware.com Date: Wed Dec 30 14:23:12 2009 +0100 translate: Implement instancing for linear SSE run. --- src/gallium/auxiliary/translate/translate_sse.c | 154 ++- 1 files changed, 120 insertions(+), 34 deletions(-) diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index edd0be1..ddfa4c6 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -49,6 +49,7 @@ typedef void (PIPE_CDECL *run_func)( struct translate *translate, unsigned start, unsigned count, + unsigned instance_id, void *output_buffer ); typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, struct translate_buffer { const void *base_ptr; unsigned stride; - void *ptr; /* updated per vertex */ +}; + +struct translate_buffer_varient { + unsigned buffer_index; + unsigned instance_divisor; + void *ptr;/* updated either per vertex or per instance */ }; @@ -81,6 +87,16 @@ struct translate_sse { struct translate_buffer 
buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; + /* Multiple buffer varients can map to a single buffer. */ + struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; + unsigned nr_buffer_varients; + + /* Multiple elements can map to a single buffer varient. */ + unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; + + boolean use_instancing; + unsigned instance_id; + run_func gen_run; run_elts_func gen_run_elts; @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p, { unsigned i; if (linear) { - for (i = 0; i < p->nr_buffers; i++) { + struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + get_offset(p, &p->instance_id)); + + for (i = 0; i < p->nr_buffer_varients; i++) { + struct translate_buffer_varient *varient = &p->buffer_varient[i]; + struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].stride)); + get_offset(p, &buffer->stride)); struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].ptr)); + get_offset(p, &varient->ptr)); struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].base_ptr)); + get_offset(p, &buffer->base_ptr)); struct x86_reg elt = p->idx_EBX; - struct x86_reg tmp = p->tmp_EAX; - + struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: + * base_ptr + stride * index, where index depends on instance divisor */ - x86_mov(p->func, tmp, buf_stride); - x86_imul(p->func, tmp, elt); - x86_add(p->func, tmp, buf_base_ptr); + if (varient->instance_divisor) { +/* Our index is instance ID divided by instance divisor. + */ +x86_mov(p->func, tmp_EAX, instance_id); + +if (varient->instance_divisor != 1) { + struct x86_reg tmp_EDX = p->machine_EDX; + struct x86_reg tmp_ECX = p->outbuf_ECX; + + /* TODO: Add x86_shr() to rtasm and use it whenever +* instance divisor is power of two. +*/ + +
Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run.
On Wed, 2009-12-30 at 07:37 -0800, michal wrote: Keith Whitwell wrote on 2009-12-30 16:22: Michal, Did you update the 'C' version of translate for this new functionality? You can't just extend the fast path - the fallback/default mode needs to be updated as well. Yes, I did that in the previous commit. OK. I was confused as your other changes to translate are labelled that way in the commit message, while this was buried inside what looked like a softpipe change. Also, I'm sure it's not necessary to do a divide per vertex-element to achieve instancing. It can't be that hard to throw some more counters at the problem and do this with a couple of adds instead of a divide... The division is done once per instance, not per every vertex attrib. OK, I misread that also... Are you serious about optimising such low-profile things? No, once per invocation is fine. Sorry for the confusion. Keith Keith On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote: Module: Mesa Branch: instanced-arrays Commit: 09c0287b84725098c0b365668231ddf00487c84c URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c Author: Michal Krol mic...@vmware.com Date: Wed Dec 30 14:23:12 2009 +0100 translate: Implement instancing for linear SSE run. 
--- src/gallium/auxiliary/translate/translate_sse.c | 154 ++- 1 files changed, 120 insertions(+), 34 deletions(-) diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index edd0be1..ddfa4c6 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -49,6 +49,7 @@ typedef void (PIPE_CDECL *run_func)( struct translate *translate, unsigned start, unsigned count, + unsigned instance_id, void *output_buffer ); typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, struct translate_buffer { const void *base_ptr; unsigned stride; - void *ptr; /* updated per vertex */ +}; + +struct translate_buffer_varient { + unsigned buffer_index; + unsigned instance_divisor; + void *ptr;/* updated either per vertex or per instance */ }; @@ -81,6 +87,16 @@ struct translate_sse { struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; + /* Multiple buffer varients can map to a single buffer. */ + struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; + unsigned nr_buffer_varients; + + /* Multiple elements can map to a single buffer varient. 
*/ + unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; + + boolean use_instancing; + unsigned instance_id; + run_func gen_run; run_elts_func gen_run_elts; @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p, { unsigned i; if (linear) { - for (i = 0; i < p->nr_buffers; i++) { + struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + get_offset(p, &p->instance_id)); + + for (i = 0; i < p->nr_buffer_varients; i++) { + struct translate_buffer_varient *varient = &p->buffer_varient[i]; + struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].stride)); + get_offset(p, &buffer->stride)); struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].ptr)); + get_offset(p, &varient->ptr)); struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, - get_offset(p, &p->buffer[i].base_ptr)); + get_offset(p, &buffer->base_ptr)); struct x86_reg elt = p->idx_EBX; - struct x86_reg tmp = p->tmp_EAX; - + struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: + * base_ptr + stride * index, where index depends on instance divisor */ - x86_mov(p->func, tmp, buf_stride); - x86_imul(p->func, tmp, elt); - x86_add(p->func, tmp, buf_base_ptr); + if (varient->instance_divisor) { +/* Our index is instance ID divided by instance