Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run .

2009-12-30 Thread Keith Whitwell
Michal,

Did you update the 'C' version of translate for this new functionality?
You can't just extend the fast path - the fallback/default mode needs to
be updated as well.

Also, I'm sure it's not necessary to do a divide per vertex-element to
achieve instancing.  It can't be that hard to throw some more counters
at the problem and do this with a couple of adds instead of a divide...

Keith

On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote:
 Module: Mesa
 Branch: instanced-arrays
 Commit: 09c0287b84725098c0b365668231ddf00487c84c
 URL:
 http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c
 
 Author: Michal Krol mic...@vmware.com
 Date:   Wed Dec 30 14:23:12 2009 +0100
 
 translate: Implement instancing for linear SSE run.
 
 ---
 
  src/gallium/auxiliary/translate/translate_sse.c |  154 
 ++-
  1 files changed, 120 insertions(+), 34 deletions(-)
 
 diff --git a/src/gallium/auxiliary/translate/translate_sse.c 
 b/src/gallium/auxiliary/translate/translate_sse.c
 index edd0be1..ddfa4c6 100644
 --- a/src/gallium/auxiliary/translate/translate_sse.c
 +++ b/src/gallium/auxiliary/translate/translate_sse.c
 @@ -49,6 +49,7 @@
  typedef void (PIPE_CDECL *run_func)( struct translate *translate,
   unsigned start,
   unsigned count,
 + unsigned instance_id,
   void *output_buffer );
 
  typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
 @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct translate 
 *translate,
  struct translate_buffer {
 const void *base_ptr;
 unsigned stride;
 -   void *ptr;   /* updated per vertex */
 +};
 +
 +struct translate_buffer_varient {
 +   unsigned buffer_index;
 +   unsigned instance_divisor;
 +   void *ptr;/* updated either per vertex or per 
 instance */
  };
 
 
 @@ -81,6 +87,16 @@ struct translate_sse {
 struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
 unsigned nr_buffers;
 
 +   /* Multiple buffer varients can map to a single buffer. */
 +   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
 +   unsigned nr_buffer_varients;
 +
 +   /* Multiple elements can map to a single buffer varient. */
 +   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
 +
 +   boolean use_instancing;
 +   unsigned instance_id;
 +
 run_func  gen_run;
 run_elts_func gen_run_elts;
 
 @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p,
  {
 unsigned i;
 if (linear) {
 -  for (i = 0; i < p->nr_buffers; i++) {
 +  struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
 + get_offset(p, 
 &p->instance_id));
 +
 +  for (i = 0; i < p->nr_buffer_varients; i++) {
 + struct translate_buffer_varient *varient = &p->buffer_varient[i];
 + struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
   struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].stride));
 + get_offset(p, 
 &buffer->stride));
   struct x86_reg buf_ptr  = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].ptr));
 + get_offset(p, 
 &varient->ptr));
   struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].base_ptr));
 + get_offset(p, 
 &buffer->base_ptr));
   struct x86_reg elt = p->idx_EBX;
 - struct x86_reg tmp = p->tmp_EAX;
 -
 + struct x86_reg tmp_EAX = p->tmp_EAX;
 
   /* Calculate pointer to first attrib:
 +  *   base_ptr + stride * index, where index depends on instance 
 divisor
*/
 - x86_mov(p->func, tmp, buf_stride);
 - x86_imul(p->func, tmp, elt);
 - x86_add(p->func, tmp, buf_base_ptr);
 + if (varient->instance_divisor) {
 +/* Our index is instance ID divided by instance divisor.
 + */
 +x86_mov(p->func, tmp_EAX, instance_id);
 +
 +if (varient->instance_divisor != 1) {
 +   struct x86_reg tmp_EDX = p->machine_EDX;
 +   struct x86_reg tmp_ECX = p->outbuf_ECX;
 +
 +   /* TODO: Add x86_shr() to rtasm and use it whenever
 +*   instance divisor is power of two.
 +*/
 +
 +   x86_push(p->func, tmp_EDX);
 +   x86_push(p->func, tmp_ECX);
 +   x86_xor(p->func, tmp_EDX, tmp_EDX);
 +   x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
 +   x86_div(p->func, 

Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run .

2009-12-30 Thread michal
Keith Whitwell wrote on 2009-12-30 16:22:
 Michal,

 Did you update the 'C' version of translate for this new functionality?
 You can't just extend the fast path - the fallback/default mode needs to
 be updated as well.

   
Yes, I did that in the previous commit.

 Also, I'm sure it's not necessary to do a divide per vertex-element to
 achieve instancing.  It can't be that hard to throw some more counters
 at the problem and do this with a couple of adds instead of a divide...

   
The division is done once per instance, not per every vertex attrib. Are 
you serious about optimising such low-profile things?

 Keith

 On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote:
   
 Module: Mesa
 Branch: instanced-arrays
 Commit: 09c0287b84725098c0b365668231ddf00487c84c
 URL:
 http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c

 Author: Michal Krol mic...@vmware.com
 Date:   Wed Dec 30 14:23:12 2009 +0100

 translate: Implement instancing for linear SSE run.

 ---

  src/gallium/auxiliary/translate/translate_sse.c |  154 
 ++-
  1 files changed, 120 insertions(+), 34 deletions(-)

 diff --git a/src/gallium/auxiliary/translate/translate_sse.c 
 b/src/gallium/auxiliary/translate/translate_sse.c
 index edd0be1..ddfa4c6 100644
 --- a/src/gallium/auxiliary/translate/translate_sse.c
 +++ b/src/gallium/auxiliary/translate/translate_sse.c
 @@ -49,6 +49,7 @@
  typedef void (PIPE_CDECL *run_func)( struct translate *translate,
   unsigned start,
   unsigned count,
 + unsigned instance_id,
   void *output_buffer );

  typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
 @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct 
 translate *translate,
  struct translate_buffer {
 const void *base_ptr;
 unsigned stride;
 -   void *ptr;   /* updated per vertex */
 +};
 +
 +struct translate_buffer_varient {
 +   unsigned buffer_index;
 +   unsigned instance_divisor;
 +   void *ptr;/* updated either per vertex or per 
 instance */
  };


 @@ -81,6 +87,16 @@ struct translate_sse {
 struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
 unsigned nr_buffers;

 +   /* Multiple buffer varients can map to a single buffer. */
 +   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
 +   unsigned nr_buffer_varients;
 +
 +   /* Multiple elements can map to a single buffer varient. */
 +   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
 +
 +   boolean use_instancing;
 +   unsigned instance_id;
 +
 run_func  gen_run;
 run_elts_func gen_run_elts;

 @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p,
  {
 unsigned i;
 if (linear) {
 -  for (i = 0; i < p->nr_buffers; i++) {
 +  struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
 + get_offset(p, 
 &p->instance_id));
 +
 +  for (i = 0; i < p->nr_buffer_varients; i++) {
 + struct translate_buffer_varient *varient = &p->buffer_varient[i];
 + struct translate_buffer *buffer = 
 &p->buffer[varient->buffer_index];
   struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].stride));
 + get_offset(p, 
 &buffer->stride));
   struct x86_reg buf_ptr  = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].ptr));
 + get_offset(p, 
 &varient->ptr));
   struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
 - get_offset(p, 
 &p->buffer[i].base_ptr));
 + get_offset(p, 
 &buffer->base_ptr));
   struct x86_reg elt = p->idx_EBX;
 - struct x86_reg tmp = p->tmp_EAX;
 -
 + struct x86_reg tmp_EAX = p->tmp_EAX;

   /* Calculate pointer to first attrib:
 +  *   base_ptr + stride * index, where index depends on instance 
 divisor
*/
 - x86_mov(p->func, tmp, buf_stride);
 - x86_imul(p->func, tmp, elt);
 - x86_add(p->func, tmp, buf_base_ptr);
 + if (varient->instance_divisor) {
 +/* Our index is instance ID divided by instance divisor.
 + */
 +x86_mov(p->func, tmp_EAX, instance_id);
 +
 +if (varient->instance_divisor != 1) {
 +   struct x86_reg tmp_EDX = p->machine_EDX;
 +   struct x86_reg tmp_ECX = p->outbuf_ECX;
 +
 +   /* TODO: Add x86_shr() to rtasm and use it whenever
 +*   instance divisor is power of two.
 +*/
 +
 +   

Re: [Mesa3d-dev] Mesa (instanced-arrays): translate: Implement instancing for linear SSE run .

2009-12-30 Thread Keith Whitwell
On Wed, 2009-12-30 at 07:37 -0800, michal wrote:
 Keith Whitwell wrote on 2009-12-30 16:22:
  Michal,
 
  Did you update the 'C' version of translate for this new functionality?
  You can't just extend the fast path - the fallback/default mode needs to
  be updated as well.
 
 
 Yes, I did that in the previous commit.


OK.  I was confused as your other changes to translate are labelled that
way in the commit message, while this was buried inside what looked like
a softpipe change.

  Also, I'm sure it's not necessary to do a divide per vertex-element to
  achieve instancing.  It can't be that hard to throw some more counters
  at the problem and do this with a couple of adds instead of a divide...
 
 
 The division is done once per instance, not per every vertex attrib. 

OK, I misread that also...

 Are you serious about optimising such low-profile things?

No, once per invocation is fine.  Sorry for the confusion.

Keith

  Keith
 
  On Wed, 2009-12-30 at 05:23 -0800, Michał Król wrote:
 
  Module: Mesa
  Branch: instanced-arrays
  Commit: 09c0287b84725098c0b365668231ddf00487c84c
  URL:
  http://cgit.freedesktop.org/mesa/mesa/commit/?id=09c0287b84725098c0b365668231ddf00487c84c
 
  Author: Michal Krol mic...@vmware.com
  Date:   Wed Dec 30 14:23:12 2009 +0100
 
  translate: Implement instancing for linear SSE run.
 
  ---
 
   src/gallium/auxiliary/translate/translate_sse.c |  154 
  ++-
   1 files changed, 120 insertions(+), 34 deletions(-)
 
  diff --git a/src/gallium/auxiliary/translate/translate_sse.c 
  b/src/gallium/auxiliary/translate/translate_sse.c
  index edd0be1..ddfa4c6 100644
  --- a/src/gallium/auxiliary/translate/translate_sse.c
  +++ b/src/gallium/auxiliary/translate/translate_sse.c
  @@ -49,6 +49,7 @@
   typedef void (PIPE_CDECL *run_func)( struct translate *translate,
unsigned start,
unsigned count,
  + unsigned instance_id,
void *output_buffer );
 
   typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
  @@ -59,7 +60,12 @@ typedef void (PIPE_CDECL *run_elts_func)( struct 
  translate *translate,
   struct translate_buffer {
  const void *base_ptr;
  unsigned stride;
  -   void *ptr;   /* updated per vertex */
  +};
  +
  +struct translate_buffer_varient {
  +   unsigned buffer_index;
  +   unsigned instance_divisor;
  +   void *ptr;/* updated either per vertex or per 
  instance */
   };
 
 
  @@ -81,6 +87,16 @@ struct translate_sse {
  struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
  unsigned nr_buffers;
 
  +   /* Multiple buffer varients can map to a single buffer. */
  +   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
  +   unsigned nr_buffer_varients;
  +
  +   /* Multiple elements can map to a single buffer varient. */
  +   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
  +
  +   boolean use_instancing;
  +   unsigned instance_id;
  +
  run_func  gen_run;
  run_elts_func gen_run_elts;
 
  @@ -360,31 +376,59 @@ static boolean init_inputs( struct translate_sse *p,
   {
  unsigned i;
  if (linear) {
  -  for (i = 0; i < p->nr_buffers; i++) {
  +  struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
  + get_offset(p, 
  &p->instance_id));
  +
  +  for (i = 0; i < p->nr_buffer_varients; i++) {
  + struct translate_buffer_varient *varient = &p->buffer_varient[i];
  + struct translate_buffer *buffer = 
  &p->buffer[varient->buffer_index];
struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
  - get_offset(p, 
  &p->buffer[i].stride));
  + get_offset(p, 
  &buffer->stride));
struct x86_reg buf_ptr  = x86_make_disp(p->machine_EDX,
  - get_offset(p, 
  &p->buffer[i].ptr));
  + get_offset(p, 
  &varient->ptr));
struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
  - get_offset(p, 
  &p->buffer[i].base_ptr));
  + get_offset(p, 
  &buffer->base_ptr));
struct x86_reg elt = p->idx_EBX;
  - struct x86_reg tmp = p->tmp_EAX;
  -
  + struct x86_reg tmp_EAX = p->tmp_EAX;
 
/* Calculate pointer to first attrib:
  +  *   base_ptr + stride * index, where index depends on instance 
  divisor
 */
  - x86_mov(p->func, tmp, buf_stride);
  - x86_imul(p->func, tmp, elt);
  - x86_add(p->func, tmp, buf_base_ptr);
  + if (varient->instance_divisor) {
  +/* Our index is instance ID divided by instance