Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Kyriazis, George

On Jul 23, 2017, at 1:51 PM, Ilia Mirkin 
> wrote:

On Sun, Jul 23, 2017 at 12:27 PM, Kyriazis, George
> wrote:

On Jul 23, 2017, at 11:21 AM, Ilia Mirkin 
> wrote:

On Sun, Jul 23, 2017 at 12:08 PM, George Kyriazis
> wrote:

The shader that is used to copy vertex data out of the vs/gs shaders to
the user-specified buffer (streamout os SO shader) was not using the
correct offsets.

Adjust the offsets that are used just for the SO shader:
- Make sure that position is handled in the same special way
as in the vs/gs shaders
- Use the correct offset to be passed in the core
- consolidate register slot mapping logic into one function, since it's
been calculated in 2 different places (one for calcuating the slot mask,
and one for the register offsets themselves

Also make room for all attibutes in the backend vertex area.

Fixes:
- all vtk GL2PS tests
- 18 piglit tests (16 ext_transform_feedback tests,
arb-quads-follow-provoking-vertex and primitive-type gl_points
---
src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
src/gallium/drivers/swr/swr_state.cpp | 31 +--
src/gallium/drivers/swr/swr_state.h   |  3 +++
3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp
b/src/gallium/drivers/swr/swr_draw.cpp
index 62ad3f7..218de0f 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -26,6 +26,7 @@
#include "swr_resource.h"
#include "swr_fence.h"
#include "swr_query.h"
+#include "swr_state.h"
#include "jit_api.h"

#include "util/u_draw.h"
@@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct
pipe_draw_info *info)
  offsets[output_buffer] = so->output[i].dst_offset;
   }

+unsigned attrib_slot = so->output[i].register_index;
+attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
+
   state.stream.decl[num].bufferIndex = output_buffer;
-state.stream.decl[num].attribSlot =
so->output[i].register_index - 1;
+state.stream.decl[num].attribSlot = attrib_slot;
   state.stream.decl[num].componentMask =
  ((1 << so->output[i].num_components) - 1)
  << so->output[i].start_component;
@@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct
pipe_draw_info *info)
  SWR_FRONTEND_STATE feState = {0};

  feState.vsVertexSize =
-  VERTEX_ATTRIB_START_SLOT +
+  VERTEX_ATTRIB_START_SLOT
 + ctx->vs->info.base.num_outputs
-  - (ctx->vs->info.base.writes_position ? 1 : 0);
+  - (ctx->vs->info.base.writes_position ? 1 : 0)
+  + ctx->fs->info.base.num_outputs;

Why do you care about the number of fs outputs in the vertex size?

In theory, there shouldn’t be, but, as I said in my previous email, the clipper 
and dinner have some dependency that forces them to use the same vertex size.  
It could be changed, but it’s a much more intrusive change.


  if (ctx->rasterizer->flatshade_first) {
 feState.provokingVertex = {1, 0, 0};
diff --git a/src/gallium/drivers/swr/swr_state.cpp
b/src/gallium/drivers/swr/swr_state.cpp
index 501fdea..3e07929 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
 // soState.streamToRasterizer not used

 for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
+ unsigned attrib_slot = stream_output->output[i].register_index;
+ attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-1 << (stream_output->output[i].register_index - 1);
+(1 << attrib_slot);
 }
 for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
   swr_vs->soState.streamNumEntries[i] =
_mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
-swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT;
// TODO: optimize
+swr_vs->soState.vertexAttribOffset[i] = 0;
  }
  }

@@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
  ctx->dirty = post_update_dirty_flags;
}

+unsigned
+swr_so_adjust_attrib(unsigned in_attrib,
+ swr_vertex_shader *swr_vs)
+{
+   ubyte semantic_name;
+   unsigned attrib;
+
+   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
+
+   if (swr_vs) {
+  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
+  if (semantic_name == TGSI_SEMANTIC_POSITION) {


But it's not just the position... there are plenty of other attributes
that need similar treatment. e.g. pointsize, layer, viewport index.


Yes, we want to deal with the other attributes, too, but lack the testing
time, due to the 17.2 schedule.  Since 

Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Kyriazis, George

On Jul 23, 2017, at 1:42 PM, Rowley, Timothy O 
> wrote:


On Jul 23, 2017, at 11:08 AM, George Kyriazis 
> wrote:

The shader that is used to copy vertex data out of the vs/gs shaders to
the user-specified buffer (streamout os SO shader) was not using the
correct offsets.

Adjust the offsets that are used just for the SO shader:
- Make sure that position is handled in the same special way
as in the vs/gs shaders
- Use the correct offset to be passed in the core
- consolidate register slot mapping logic into one function, since it's
been calculated in 2 different places (one for calcuating the slot mask,
and one for the register offsets themselves

Also make room for all attibutes in the backend vertex area.

Add a comment to the commit indicating that as Ilia states, this is not a 
complete solution.

Ok


Fixes:
- all vtk GL2PS tests
- 18 piglit tests (16 ext_transform_feedback tests,
arb-quads-follow-provoking-vertex and primitive-type gl_points
---
src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
src/gallium/drivers/swr/swr_state.cpp | 31 +--
src/gallium/drivers/swr/swr_state.h   |  3 +++
3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
b/src/gallium/drivers/swr/swr_draw.cpp
index 62ad3f7..218de0f 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -26,6 +26,7 @@
#include "swr_resource.h"
#include "swr_fence.h"
#include "swr_query.h"
+#include "swr_state.h"
#include "jit_api.h"

#include "util/u_draw.h"
@@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
  offsets[output_buffer] = so->output[i].dst_offset;
   }

+unsigned attrib_slot = so->output[i].register_index;
+attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
+
   state.stream.decl[num].bufferIndex = output_buffer;
-state.stream.decl[num].attribSlot = so->output[i].register_index - 
1;
+state.stream.decl[num].attribSlot = attrib_slot;
   state.stream.decl[num].componentMask =
  ((1 << so->output[i].num_components) - 1)
  << so->output[i].start_component;
@@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
  SWR_FRONTEND_STATE feState = {0};

  feState.vsVertexSize =
-  VERTEX_ATTRIB_START_SLOT +
+  VERTEX_ATTRIB_START_SLOT
 + ctx->vs->info.base.num_outputs
-  - (ctx->vs->info.base.writes_position ? 1 : 0);
+  - (ctx->vs->info.base.writes_position ? 1 : 0)
+  + ctx->fs->info.base.num_outputs;

Sizing vsVertexSize to essentially vs->num_outputs + fs->num_outputs seems odd, 
as the fe shouldn’t care about the number of outputs of the fs (inputs, maybe).

The clipper/binner code uses the stride (which originates from vsVertexSize) to 
create the intermediate vertices for the backend.  The sizes for the FE and BE 
are bound by the same size because of that.  In order to size the vertices to 
the absolute minimum size, you’ll have to do more intrusive changes in the core 
to decouple the two.

  if (ctx->rasterizer->flatshade_first) {
 feState.provokingVertex = {1, 0, 0};
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index 501fdea..3e07929 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
 // soState.streamToRasterizer not used

 for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
+ unsigned attrib_slot = stream_output->output[i].register_index;
+ attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-1 << (stream_output->output[i].register_index - 1);
+(1 << attrib_slot);
 }
 for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
   swr_vs->soState.streamNumEntries[i] =
_mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
-swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT; // 
TODO: optimize
+swr_vs->soState.vertexAttribOffset[i] = 0;
  }
  }

@@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
  ctx->dirty = post_update_dirty_flags;
}

+unsigned
+swr_so_adjust_attrib(unsigned in_attrib,
+ swr_vertex_shader *swr_vs)
+{
+   ubyte semantic_name;
+   unsigned attrib;
+
+   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
+
+   if (swr_vs) {
+  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
+  if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ attrib = VERTEX_POSITION_SLOT;
+  } else {
+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
+if 

Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Ilia Mirkin
On Sun, Jul 23, 2017 at 12:27 PM, Kyriazis, George
 wrote:
>
> On Jul 23, 2017, at 11:21 AM, Ilia Mirkin  wrote:
>
> On Sun, Jul 23, 2017 at 12:08 PM, George Kyriazis
>  wrote:
>
> The shader that is used to copy vertex data out of the vs/gs shaders to
> the user-specified buffer (streamout os SO shader) was not using the
> correct offsets.
>
> Adjust the offsets that are used just for the SO shader:
> - Make sure that position is handled in the same special way
>  as in the vs/gs shaders
> - Use the correct offset to be passed in the core
> - consolidate register slot mapping logic into one function, since it's
>  been calculated in 2 different places (one for calcuating the slot mask,
>  and one for the register offsets themselves
>
> Also make room for all attibutes in the backend vertex area.
>
> Fixes:
> - all vtk GL2PS tests
> - 18 piglit tests (16 ext_transform_feedback tests,
>  arb-quads-follow-provoking-vertex and primitive-type gl_points
> ---
> src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
> src/gallium/drivers/swr/swr_state.cpp | 31 +--
> src/gallium/drivers/swr/swr_state.h   |  3 +++
> 3 files changed, 40 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/swr/swr_draw.cpp
> b/src/gallium/drivers/swr/swr_draw.cpp
> index 62ad3f7..218de0f 100644
> --- a/src/gallium/drivers/swr/swr_draw.cpp
> +++ b/src/gallium/drivers/swr/swr_draw.cpp
> @@ -26,6 +26,7 @@
> #include "swr_resource.h"
> #include "swr_fence.h"
> #include "swr_query.h"
> +#include "swr_state.h"
> #include "jit_api.h"
>
> #include "util/u_draw.h"
> @@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct
> pipe_draw_info *info)
>offsets[output_buffer] = so->output[i].dst_offset;
> }
>
> +unsigned attrib_slot = so->output[i].register_index;
> +attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
> +
> state.stream.decl[num].bufferIndex = output_buffer;
> -state.stream.decl[num].attribSlot =
> so->output[i].register_index - 1;
> +state.stream.decl[num].attribSlot = attrib_slot;
> state.stream.decl[num].componentMask =
>((1 << so->output[i].num_components) - 1)
><< so->output[i].start_component;
> @@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct
> pipe_draw_info *info)
>SWR_FRONTEND_STATE feState = {0};
>
>feState.vsVertexSize =
> -  VERTEX_ATTRIB_START_SLOT +
> +  VERTEX_ATTRIB_START_SLOT
>   + ctx->vs->info.base.num_outputs
> -  - (ctx->vs->info.base.writes_position ? 1 : 0);
> +  - (ctx->vs->info.base.writes_position ? 1 : 0)
> +  + ctx->fs->info.base.num_outputs;

Why do you care about the number of fs outputs in the vertex size?

>
>if (ctx->rasterizer->flatshade_first) {
>   feState.provokingVertex = {1, 0, 0};
> diff --git a/src/gallium/drivers/swr/swr_state.cpp
> b/src/gallium/drivers/swr/swr_state.cpp
> index 501fdea..3e07929 100644
> --- a/src/gallium/drivers/swr/swr_state.cpp
> +++ b/src/gallium/drivers/swr/swr_state.cpp
> @@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
>   // soState.streamToRasterizer not used
>
>   for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
> + unsigned attrib_slot = stream_output->output[i].register_index;
> + attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
>  swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
> -1 << (stream_output->output[i].register_index - 1);
> +(1 << attrib_slot);
>   }
>   for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
> swr_vs->soState.streamNumEntries[i] =
>  _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
> -swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT;
> // TODO: optimize
> +swr_vs->soState.vertexAttribOffset[i] = 0;
>}
>}
>
> @@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
>ctx->dirty = post_update_dirty_flags;
> }
>
> +unsigned
> +swr_so_adjust_attrib(unsigned in_attrib,
> + swr_vertex_shader *swr_vs)
> +{
> +   ubyte semantic_name;
> +   unsigned attrib;
> +
> +   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
> +
> +   if (swr_vs) {
> +  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
> +  if (semantic_name == TGSI_SEMANTIC_POSITION) {
>
>
> But it's not just the position... there are plenty of other attributes
> that need similar treatment. e.g. pointsize, layer, viewport index.
>
>
> Yes, we want to deal with the other attributes, too, but lack the testing
> time, due to the 17.2 schedule.  Since position fixes the vtk tests (that we
> are mostly concerned about), that is what this patch covers.  It’s in our
> plan to deal with the rest of the attributes in the near future 

Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Rowley, Timothy O

> On Jul 23, 2017, at 11:08 AM, George Kyriazis  
> wrote:
> 
> The shader that is used to copy vertex data out of the vs/gs shaders to
> the user-specified buffer (streamout os SO shader) was not using the
> correct offsets.
> 
> Adjust the offsets that are used just for the SO shader:
> - Make sure that position is handled in the same special way
>  as in the vs/gs shaders
> - Use the correct offset to be passed in the core
> - consolidate register slot mapping logic into one function, since it's
>  been calculated in 2 different places (one for calcuating the slot mask,
>  and one for the register offsets themselves
> 
> Also make room for all attibutes in the backend vertex area.

Add a comment to the commit indicating that as Ilia states, this is not a 
complete solution.

> 
> Fixes:
> - all vtk GL2PS tests
> - 18 piglit tests (16 ext_transform_feedback tests,
>  arb-quads-follow-provoking-vertex and primitive-type gl_points
> ---
> src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
> src/gallium/drivers/swr/swr_state.cpp | 31 +--
> src/gallium/drivers/swr/swr_state.h   |  3 +++
> 3 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
> b/src/gallium/drivers/swr/swr_draw.cpp
> index 62ad3f7..218de0f 100644
> --- a/src/gallium/drivers/swr/swr_draw.cpp
> +++ b/src/gallium/drivers/swr/swr_draw.cpp
> @@ -26,6 +26,7 @@
> #include "swr_resource.h"
> #include "swr_fence.h"
> #include "swr_query.h"
> +#include "swr_state.h"
> #include "jit_api.h"
> 
> #include "util/u_draw.h"
> @@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
> pipe_draw_info *info)
>offsets[output_buffer] = so->output[i].dst_offset;
> }
> 
> +unsigned attrib_slot = so->output[i].register_index;
> +attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
> +
> state.stream.decl[num].bufferIndex = output_buffer;
> -state.stream.decl[num].attribSlot = so->output[i].register_index 
> - 1;
> +state.stream.decl[num].attribSlot = attrib_slot;
> state.stream.decl[num].componentMask =
>((1 << so->output[i].num_components) - 1)
><< so->output[i].start_component;
> @@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
> pipe_draw_info *info)
>SWR_FRONTEND_STATE feState = {0};
> 
>feState.vsVertexSize =
> -  VERTEX_ATTRIB_START_SLOT +
> +  VERTEX_ATTRIB_START_SLOT
>   + ctx->vs->info.base.num_outputs
> -  - (ctx->vs->info.base.writes_position ? 1 : 0);
> +  - (ctx->vs->info.base.writes_position ? 1 : 0)
> +  + ctx->fs->info.base.num_outputs;

Sizing vsVertexSize to essentially vs->num_outputs + fs->num_outputs seems odd, 
as the fe shouldn’t care about the number of outputs of the fs (inputs, maybe).

>if (ctx->rasterizer->flatshade_first) {
>   feState.provokingVertex = {1, 0, 0};
> diff --git a/src/gallium/drivers/swr/swr_state.cpp 
> b/src/gallium/drivers/swr/swr_state.cpp
> index 501fdea..3e07929 100644
> --- a/src/gallium/drivers/swr/swr_state.cpp
> +++ b/src/gallium/drivers/swr/swr_state.cpp
> @@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
>   // soState.streamToRasterizer not used
> 
>   for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
> + unsigned attrib_slot = stream_output->output[i].register_index;
> + attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
>  swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
> -1 << (stream_output->output[i].register_index - 1);
> +(1 << attrib_slot);
>   }
>   for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
> swr_vs->soState.streamNumEntries[i] =
>  _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
> -swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT; // 
> TODO: optimize
> +swr_vs->soState.vertexAttribOffset[i] = 0;
>}
>}
> 
> @@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
>ctx->dirty = post_update_dirty_flags;
> }
> 
> +unsigned
> +swr_so_adjust_attrib(unsigned in_attrib,
> + swr_vertex_shader *swr_vs)
> +{
> +   ubyte semantic_name;
> +   unsigned attrib;
> +
> +   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
> +
> +   if (swr_vs) {
> +  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
> +  if (semantic_name == TGSI_SEMANTIC_POSITION) {
> + attrib = VERTEX_POSITION_SLOT;
> +  } else {
> + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
> +if (swr_vs->info.base.output_semantic_name[i] == 
> TGSI_SEMANTIC_POSITION) {
> +   attrib--;
> +   break;
> +}
> + }

Couldn’t this for loop be replaced with a “if 
(swr_vs->info.base.writes_position) attrib—;”?

> +  }

Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Kyriazis, George

On Jul 23, 2017, at 11:21 AM, Ilia Mirkin 
> wrote:

On Sun, Jul 23, 2017 at 12:08 PM, George Kyriazis
> wrote:
The shader that is used to copy vertex data out of the vs/gs shaders to
the user-specified buffer (streamout os SO shader) was not using the
correct offsets.

Adjust the offsets that are used just for the SO shader:
- Make sure that position is handled in the same special way
 as in the vs/gs shaders
- Use the correct offset to be passed in the core
- consolidate register slot mapping logic into one function, since it's
 been calculated in 2 different places (one for calcuating the slot mask,
 and one for the register offsets themselves

Also make room for all attibutes in the backend vertex area.

Fixes:
- all vtk GL2PS tests
- 18 piglit tests (16 ext_transform_feedback tests,
 arb-quads-follow-provoking-vertex and primitive-type gl_points
---
src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
src/gallium/drivers/swr/swr_state.cpp | 31 +--
src/gallium/drivers/swr/swr_state.h   |  3 +++
3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
b/src/gallium/drivers/swr/swr_draw.cpp
index 62ad3f7..218de0f 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -26,6 +26,7 @@
#include "swr_resource.h"
#include "swr_fence.h"
#include "swr_query.h"
+#include "swr_state.h"
#include "jit_api.h"

#include "util/u_draw.h"
@@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
   offsets[output_buffer] = so->output[i].dst_offset;
}

+unsigned attrib_slot = so->output[i].register_index;
+attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
+
state.stream.decl[num].bufferIndex = output_buffer;
-state.stream.decl[num].attribSlot = so->output[i].register_index - 
1;
+state.stream.decl[num].attribSlot = attrib_slot;
state.stream.decl[num].componentMask =
   ((1 << so->output[i].num_components) - 1)
   << so->output[i].start_component;
@@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
   SWR_FRONTEND_STATE feState = {0};

   feState.vsVertexSize =
-  VERTEX_ATTRIB_START_SLOT +
+  VERTEX_ATTRIB_START_SLOT
  + ctx->vs->info.base.num_outputs
-  - (ctx->vs->info.base.writes_position ? 1 : 0);
+  - (ctx->vs->info.base.writes_position ? 1 : 0)
+  + ctx->fs->info.base.num_outputs;

   if (ctx->rasterizer->flatshade_first) {
  feState.provokingVertex = {1, 0, 0};
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index 501fdea..3e07929 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
  // soState.streamToRasterizer not used

  for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
+ unsigned attrib_slot = stream_output->output[i].register_index;
+ attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
 swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-1 << (stream_output->output[i].register_index - 1);
+(1 << attrib_slot);
  }
  for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
swr_vs->soState.streamNumEntries[i] =
 _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
-swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT; // 
TODO: optimize
+swr_vs->soState.vertexAttribOffset[i] = 0;
   }
   }

@@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
   ctx->dirty = post_update_dirty_flags;
}

+unsigned
+swr_so_adjust_attrib(unsigned in_attrib,
+ swr_vertex_shader *swr_vs)
+{
+   ubyte semantic_name;
+   unsigned attrib;
+
+   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
+
+   if (swr_vs) {
+  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
+  if (semantic_name == TGSI_SEMANTIC_POSITION) {

But it's not just the position... there are plenty of other attributes
that need similar treatment. e.g. pointsize, layer, viewport index.

Yes, we want to deal with the other attributes, too, but lack the testing time, 
due to the 17.2 schedule.  Since position fixes the vtk tests (that we are 
mostly concerned about), that is what this patch covers.  It’s in our plan to 
deal with the rest of the attributes in the near future (before we get 
distracted).

I tried to fix this a while back but IIRC my solution was left wanting:

https://github.com/imirkin/mesa/commits/swr

(see the WIP commits at the top).

I ended up getting distracted and never completed the work. Feel free
to copy/ignore/whatever the above commits. I also tried 

Re: [Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread Ilia Mirkin
On Sun, Jul 23, 2017 at 12:08 PM, George Kyriazis
 wrote:
> The shader that is used to copy vertex data out of the vs/gs shaders to
> the user-specified buffer (streamout os SO shader) was not using the
> correct offsets.
>
> Adjust the offsets that are used just for the SO shader:
> - Make sure that position is handled in the same special way
>   as in the vs/gs shaders
> - Use the correct offset to be passed in the core
> - consolidate register slot mapping logic into one function, since it's
>   been calculated in 2 different places (one for calcuating the slot mask,
>   and one for the register offsets themselves
>
> Also make room for all attibutes in the backend vertex area.
>
> Fixes:
> - all vtk GL2PS tests
> - 18 piglit tests (16 ext_transform_feedback tests,
>   arb-quads-follow-provoking-vertex and primitive-type gl_points
> ---
>  src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
>  src/gallium/drivers/swr/swr_state.cpp | 31 +--
>  src/gallium/drivers/swr/swr_state.h   |  3 +++
>  3 files changed, 40 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
> b/src/gallium/drivers/swr/swr_draw.cpp
> index 62ad3f7..218de0f 100644
> --- a/src/gallium/drivers/swr/swr_draw.cpp
> +++ b/src/gallium/drivers/swr/swr_draw.cpp
> @@ -26,6 +26,7 @@
>  #include "swr_resource.h"
>  #include "swr_fence.h"
>  #include "swr_query.h"
> +#include "swr_state.h"
>  #include "jit_api.h"
>
>  #include "util/u_draw.h"
> @@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
> pipe_draw_info *info)
> offsets[output_buffer] = so->output[i].dst_offset;
>  }
>
> +unsigned attrib_slot = so->output[i].register_index;
> +attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
> +
>  state.stream.decl[num].bufferIndex = output_buffer;
> -state.stream.decl[num].attribSlot = so->output[i].register_index 
> - 1;
> +state.stream.decl[num].attribSlot = attrib_slot;
>  state.stream.decl[num].componentMask =
> ((1 << so->output[i].num_components) - 1)
> << so->output[i].start_component;
> @@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
> pipe_draw_info *info)
> SWR_FRONTEND_STATE feState = {0};
>
> feState.vsVertexSize =
> -  VERTEX_ATTRIB_START_SLOT +
> +  VERTEX_ATTRIB_START_SLOT
>+ ctx->vs->info.base.num_outputs
> -  - (ctx->vs->info.base.writes_position ? 1 : 0);
> +  - (ctx->vs->info.base.writes_position ? 1 : 0)
> +  + ctx->fs->info.base.num_outputs;
>
> if (ctx->rasterizer->flatshade_first) {
>feState.provokingVertex = {1, 0, 0};
> diff --git a/src/gallium/drivers/swr/swr_state.cpp 
> b/src/gallium/drivers/swr/swr_state.cpp
> index 501fdea..3e07929 100644
> --- a/src/gallium/drivers/swr/swr_state.cpp
> +++ b/src/gallium/drivers/swr/swr_state.cpp
> @@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
>// soState.streamToRasterizer not used
>
>for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
> + unsigned attrib_slot = stream_output->output[i].register_index;
> + attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
>   swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
> -1 << (stream_output->output[i].register_index - 1);
> +(1 << attrib_slot);
>}
>for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
>  swr_vs->soState.streamNumEntries[i] =
>   _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
> -swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT; // 
> TODO: optimize
> +swr_vs->soState.vertexAttribOffset[i] = 0;
> }
> }
>
> @@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
> ctx->dirty = post_update_dirty_flags;
>  }
>
> +unsigned
> +swr_so_adjust_attrib(unsigned in_attrib,
> + swr_vertex_shader *swr_vs)
> +{
> +   ubyte semantic_name;
> +   unsigned attrib;
> +
> +   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
> +
> +   if (swr_vs) {
> +  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
> +  if (semantic_name == TGSI_SEMANTIC_POSITION) {

But it's not just the position... there are plenty of other attributes
that need similar treatment. e.g. pointsize, layer, viewport index.

I tried to fix this a while back but IIRC my solution was left wanting:

https://github.com/imirkin/mesa/commits/swr

(see the WIP commits at the top).

I ended up getting distracted and never completed the work. Feel free
to copy/ignore/whatever the above commits. I also tried to provide a
ARB_tf2 impl as well (i.e. pause/resume).

> + attrib = VERTEX_POSITION_SLOT;
> +  } else {
> + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
> +if 

[Mesa-dev] [PATCH] swr: fix transform feedback logic

2017-07-23 Thread George Kyriazis
The shader that is used to copy vertex data out of the vs/gs shaders to
the user-specified buffer (streamout os SO shader) was not using the
correct offsets.

Adjust the offsets that are used just for the SO shader:
- Make sure that position is handled in the same special way
  as in the vs/gs shaders
- Use the correct offset to be passed in the core
- consolidate register slot mapping logic into one function, since it's
  been calculated in 2 different places (one for calcuating the slot mask,
  and one for the register offsets themselves

Also make room for all attibutes in the backend vertex area.

Fixes:
- all vtk GL2PS tests
- 18 piglit tests (16 ext_transform_feedback tests,
  arb-quads-follow-provoking-vertex and primitive-type gl_points
---
 src/gallium/drivers/swr/swr_draw.cpp  | 11 ---
 src/gallium/drivers/swr/swr_state.cpp | 31 +--
 src/gallium/drivers/swr/swr_state.h   |  3 +++
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
b/src/gallium/drivers/swr/swr_draw.cpp
index 62ad3f7..218de0f 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -26,6 +26,7 @@
 #include "swr_resource.h"
 #include "swr_fence.h"
 #include "swr_query.h"
+#include "swr_state.h"
 #include "jit_api.h"
 
 #include "util/u_draw.h"
@@ -81,8 +82,11 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
offsets[output_buffer] = so->output[i].dst_offset;
 }
 
+unsigned attrib_slot = so->output[i].register_index;
+attrib_slot = swr_so_adjust_attrib(attrib_slot, ctx->vs);
+
 state.stream.decl[num].bufferIndex = output_buffer;
-state.stream.decl[num].attribSlot = so->output[i].register_index - 
1;
+state.stream.decl[num].attribSlot = attrib_slot;
 state.stream.decl[num].componentMask =
((1 << so->output[i].num_components) - 1)
<< so->output[i].start_component;
@@ -130,9 +134,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
SWR_FRONTEND_STATE feState = {0};
 
feState.vsVertexSize =
-  VERTEX_ATTRIB_START_SLOT +
+  VERTEX_ATTRIB_START_SLOT
   + ctx->vs->info.base.num_outputs
-  - (ctx->vs->info.base.writes_position ? 1 : 0);
+  - (ctx->vs->info.base.writes_position ? 1 : 0)
+  + ctx->fs->info.base.num_outputs;
 
if (ctx->rasterizer->flatshade_first) {
   feState.provokingVertex = {1, 0, 0};
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index 501fdea..3e07929 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -345,13 +345,15 @@ swr_create_vs_state(struct pipe_context *pipe,
   // soState.streamToRasterizer not used
 
   for (uint32_t i = 0; i < stream_output->num_outputs; i++) {
+ unsigned attrib_slot = stream_output->output[i].register_index;
+ attrib_slot = swr_so_adjust_attrib(attrib_slot, swr_vs);
  swr_vs->soState.streamMasks[stream_output->output[i].stream] |=
-1 << (stream_output->output[i].register_index - 1);
+(1 << attrib_slot);
   }
   for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) {
 swr_vs->soState.streamNumEntries[i] =
  _mm_popcnt_u32(swr_vs->soState.streamMasks[i]);
-swr_vs->soState.vertexAttribOffset[i] = VERTEX_ATTRIB_START_SLOT; // 
TODO: optimize
+swr_vs->soState.vertexAttribOffset[i] = 0;
}
}
 
@@ -1777,6 +1779,31 @@ swr_update_derived(struct pipe_context *pipe,
ctx->dirty = post_update_dirty_flags;
 }
 
+unsigned
+swr_so_adjust_attrib(unsigned in_attrib,
+ swr_vertex_shader *swr_vs)
+{
+   ubyte semantic_name;
+   unsigned attrib;
+
+   attrib = in_attrib + VERTEX_ATTRIB_START_SLOT;
+
+   if (swr_vs) {
+  semantic_name = swr_vs->info.base.output_semantic_name[in_attrib];
+  if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ attrib = VERTEX_POSITION_SLOT;
+  } else {
+ for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) {
+if (swr_vs->info.base.output_semantic_name[i] == 
TGSI_SEMANTIC_POSITION) {
+   attrib--;
+   break;
+}
+ }
+  }
+   }
+
+   return attrib;
+}
 
 static struct pipe_stream_output_target *
 swr_create_so_target(struct pipe_context *pipe,
diff --git a/src/gallium/drivers/swr/swr_state.h 
b/src/gallium/drivers/swr/swr_state.h
index 7940a96..8cbd463 100644
--- a/src/gallium/drivers/swr/swr_state.h
+++ b/src/gallium/drivers/swr/swr_state.h
@@ -110,6 +110,9 @@ struct swr_derived_state {
 void swr_update_derived(struct pipe_context *,
 const struct pipe_draw_info * = nullptr);
 
+unsigned swr_so_adjust_attrib(unsigned in_attrib,
+  swr_vertex_shader *swr_vs);
+
 /*
  * Conversion functions: