Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

2017-09-25 Thread Cherniak, Bruce
On Sep 25, 2017, at 11:31 AM, Rowley, Timothy O 
> wrote:

Ok, made the following changes - want a full v2 commit, or ok to do this on 
push?

I'm fine with doing it on push and don't need a full v2.  It simply replaces a 
couple magic numbers with their defines -- no functional change.
I'll mark the entire set rvb in patch 0/9.

--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, 
swr_jit_gs_key )
pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
pGS->controlDataSize = 8; // GS ouputs max of 8 32B units
-   pGS->controlDataOffset = 32;
-   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 
32;
+   pGS->controlDataOffset = VERTEX_COUNT_SIZE;
+   pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;

pGS->allocationSize =
-  32 + // vertex count
-  (8 * 32) + // control header
+  VERTEX_COUNT_SIZE + // vertex count
+  CONTROL_HEADER_SIZE + // control header
   (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
   pGS->maxNumVerts; // num verts


On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce 
> wrote:


On Sep 21, 2017, at 7:46 PM, Tim Rowley 
> wrote:

One piglit regression, which was a false pass:
spec@glsl-1.50@execution@geometry@dynamic_input_array_index
---
.../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
src/gallium/drivers/swr/swr_shader.cpp | 183 -
3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869..26e76a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num

THREAD SWR_GS_CONTEXT tlsGsContext;

-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
{
-GsBufferInfo(const SWR_GS_STATE &gsState)
-{
-const uint32_t vertexCount = gsState.maxNumVerts;
-const uint32_t vertexStride = sizeof(SIMDVERTEX);
-const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
SIMD_WIDTH;
+uint8_t* pGsIn;
+uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+uint8_t* pGsTransposed;
+void* pStreamCutBuffer;
+};

-vertexPrimitiveStride = vertexStride * numSimdBatches;
-vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template<typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
uint32_t numAttribs)
+{
+uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

-if (gsState.isSingleStream)
-{
-cutPrimitiveStride = (vertexCount + 7) / 8;
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

-streamCutPrimitiveStride = 0;
-streamCutInstanceStride = 0;
-}
-else
-{
-cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-streamCutPrimitiveStride = (vertexCount + 7) / 8;
-streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-}
+for (uint32_t i = 0; i < SimdWidth; ++i)
+{
+gatherOffsets[i] = srcVertexStride * i;
   }
+auto vGatherOffsets = SIMD_T::load_si((typename 
SIMD_T::Integer*)&gatherOffsets[0]);

-uint32_t vertexPrimitiveStride;
-uint32_t vertexInstanceStride;
+uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+uint32_t remainingVerts = numVerts;

-uint32_t cutPrimitiveStride;
-uint32_t cutInstanceStride;
+for (uint32_t s = 0; s < numSimd; ++s)
+{
+uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+uint8_t* pDstBase = pDst + s * dstVertexStride;

-uint32_t streamCutPrimitiveStride;
-uint32_t streamCutInstanceStride;
-};
+// Compute mask to prevent src overflow
+   

Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

2017-09-25 Thread Rowley, Timothy O
Ok, made the following changes - want a full v2 commit, or ok to do this on 
push?

--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, 
swr_jit_gs_key )
pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
pGS->controlDataSize = 8; // GS ouputs max of 8 32B units
-   pGS->controlDataOffset = 32;
-   pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 
32;
+   pGS->controlDataOffset = VERTEX_COUNT_SIZE;
+   pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;

pGS->allocationSize =
-  32 + // vertex count
-  (8 * 32) + // control header
+  VERTEX_COUNT_SIZE + // vertex count
+  CONTROL_HEADER_SIZE + // control header
   (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
   pGS->maxNumVerts; // num verts


On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce 
> wrote:


On Sep 21, 2017, at 7:46 PM, Tim Rowley 
> wrote:

One piglit regression, which was a false pass:
spec@glsl-1.50@execution@geometry@dynamic_input_array_index
---
.../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
src/gallium/drivers/swr/swr_shader.cpp | 183 -
3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869..26e76a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num

THREAD SWR_GS_CONTEXT tlsGsContext;

-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
{
-GsBufferInfo(const SWR_GS_STATE &gsState)
-{
-const uint32_t vertexCount = gsState.maxNumVerts;
-const uint32_t vertexStride = sizeof(SIMDVERTEX);
-const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
SIMD_WIDTH;
+uint8_t* pGsIn;
+uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+uint8_t* pGsTransposed;
+void* pStreamCutBuffer;
+};

-vertexPrimitiveStride = vertexStride * numSimdBatches;
-vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template<typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
uint32_t numAttribs)
+{
+uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

-if (gsState.isSingleStream)
-{
-cutPrimitiveStride = (vertexCount + 7) / 8;
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

-streamCutPrimitiveStride = 0;
-streamCutInstanceStride = 0;
-}
-else
-{
-cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-streamCutPrimitiveStride = (vertexCount + 7) / 8;
-streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-}
+for (uint32_t i = 0; i < SimdWidth; ++i)
+{
+gatherOffsets[i] = srcVertexStride * i;
   }
+auto vGatherOffsets = SIMD_T::load_si((typename 
SIMD_T::Integer*)&gatherOffsets[0]);

-uint32_t vertexPrimitiveStride;
-uint32_t vertexInstanceStride;
+uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+uint32_t remainingVerts = numVerts;

-uint32_t cutPrimitiveStride;
-uint32_t cutInstanceStride;
+for (uint32_t s = 0; s < numSimd; ++s)
+{
+uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+uint8_t* pDstBase = pDst + s * dstVertexStride;

-uint32_t streamCutPrimitiveStride;
-uint32_t streamCutInstanceStride;
-};
+// Compute mask to prevent src overflow
+uint32_t mask = std::min(remainingVerts, SimdWidth);
+mask = GenMask(mask);
+auto vMask = SIMD_T::vmask_ps(mask);
+auto viMask = SIMD_T::castps_si(vMask);
+
+for (uint32_t a = 0; a < numAttribs; ++a)
+{
+auto attribGatherX = SIMD_T::template 

Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

2017-09-23 Thread Cherniak, Bruce

> On Sep 21, 2017, at 7:46 PM, Tim Rowley  wrote:
> 
> One piglit regression, which was a false pass:
>  spec@glsl-1.50@execution@geometry@dynamic_input_array_index
> ---
> .../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
> src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
> src/gallium/drivers/swr/swr_shader.cpp | 183 -
> 3 files changed, 253 insertions(+), 212 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
> b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> index f882869..26e76a9 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
> @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
> pStreamIdBase, uint32_t num
> 
> THREAD SWR_GS_CONTEXT tlsGsContext;
> 
> -template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
> -struct GsBufferInfo
> +// Buffers that are allocated if GS is enabled
> +struct GsBuffers
> {
> -GsBufferInfo(const SWR_GS_STATE &gsState)
> -{
> -const uint32_t vertexCount = gsState.maxNumVerts;
> -const uint32_t vertexStride = sizeof(SIMDVERTEX);
> -const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
> SIMD_WIDTH;
> +uint8_t* pGsIn;
> +uint8_t* pGsOut[KNOB_SIMD_WIDTH];
> +uint8_t* pGsTransposed;
> +void* pStreamCutBuffer;
> +};
> 
> -vertexPrimitiveStride = vertexStride * numSimdBatches;
> -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
> +//
> +/// @brief Transposes GS output from SOA to AOS to feed the primitive 
> assembler
> +/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
> fed into the primitive assembler
> +/// @param pSrc - Buffer of vertices in SOA form written by the geometry 
> shader
> +/// @param numVerts - Number of vertices outputted by the GS
> +/// @param numAttribs - Number of attributes per vertex
> +template<typename SIMD_T, uint32_t SimdWidth>
> +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
> uint32_t numAttribs)
> +{
> +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
> +uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 
> 4;
> 
> -if (gsState.isSingleStream)
> -{
> -cutPrimitiveStride = (vertexCount + 7) / 8;
> -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
> +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
> 
> -streamCutPrimitiveStride = 0;
> -streamCutInstanceStride = 0;
> -}
> -else
> -{
> -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
> -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
> -
> -streamCutPrimitiveStride = (vertexCount + 7) / 8;
> -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
> -}
> +for (uint32_t i = 0; i < SimdWidth; ++i)
> +{
> +gatherOffsets[i] = srcVertexStride * i;
> }
> +auto vGatherOffsets = SIMD_T::load_si((typename 
> SIMD_T::Integer*)&gatherOffsets[0]);
> 
> -uint32_t vertexPrimitiveStride;
> -uint32_t vertexInstanceStride;
> +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
> +uint32_t remainingVerts = numVerts;
> 
> -uint32_t cutPrimitiveStride;
> -uint32_t cutInstanceStride;
> +for (uint32_t s = 0; s < numSimd; ++s)
> +{
> +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
> +uint8_t* pDstBase = pDst + s * dstVertexStride;
> 
> -uint32_t streamCutPrimitiveStride;
> -uint32_t streamCutInstanceStride;
> -};
> +// Compute mask to prevent src overflow
> +uint32_t mask = std::min(remainingVerts, SimdWidth);
> +mask = GenMask(mask);
> +auto vMask = SIMD_T::vmask_ps(mask);
> +auto viMask = SIMD_T::castps_si(vMask);
> +
> +for (uint32_t a = 0; a < numAttribs; ++a)
> +{
> +auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, 
> vGatherOffsets, vMask);
> +auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
> sizeof(float)), vGatherOffsets, vMask);
> +auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
> sizeof(float) * 2), vGatherOffsets, vMask);
> +auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
> sizeof(float) * 3), vGatherOffsets, vMask);
> +
> +SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
> +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
> SIMD_T::Float)), viMask, attribGatherY);
> +

[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

2017-09-21 Thread Tim Rowley
One piglit regression, which was a false pass:
  spec@glsl-1.50@execution@geometry@dynamic_input_array_index
---
 .../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
 src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
 src/gallium/drivers/swr/swr_shader.cpp | 183 -
 3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869..26e76a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num
 
 THREAD SWR_GS_CONTEXT tlsGsContext;
 
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
 {
-GsBufferInfo(const SWR_GS_STATE &gsState)
-{
-const uint32_t vertexCount = gsState.maxNumVerts;
-const uint32_t vertexStride = sizeof(SIMDVERTEX);
-const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
SIMD_WIDTH;
+uint8_t* pGsIn;
+uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+uint8_t* pGsTransposed;
+void* pStreamCutBuffer;
+};
 
-vertexPrimitiveStride = vertexStride * numSimdBatches;
-vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template<typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
uint32_t numAttribs)
+{
+uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
 
-if (gsState.isSingleStream)
-{
-cutPrimitiveStride = (vertexCount + 7) / 8;
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
 
-streamCutPrimitiveStride = 0;
-streamCutInstanceStride = 0;
-}
-else
-{
-cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-streamCutPrimitiveStride = (vertexCount + 7) / 8;
-streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-}
+for (uint32_t i = 0; i < SimdWidth; ++i)
+{
+gatherOffsets[i] = srcVertexStride * i;
 }
+auto vGatherOffsets = SIMD_T::load_si((typename 
SIMD_T::Integer*)&gatherOffsets[0]);
 
-uint32_t vertexPrimitiveStride;
-uint32_t vertexInstanceStride;
+uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+uint32_t remainingVerts = numVerts;
 
-uint32_t cutPrimitiveStride;
-uint32_t cutInstanceStride;
+for (uint32_t s = 0; s < numSimd; ++s)
+{
+uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+uint8_t* pDstBase = pDst + s * dstVertexStride;
 
-uint32_t streamCutPrimitiveStride;
-uint32_t streamCutInstanceStride;
-};
+// Compute mask to prevent src overflow
+uint32_t mask = std::min(remainingVerts, SimdWidth);
+mask = GenMask(mask);
+auto vMask = SIMD_T::vmask_ps(mask);
+auto viMask = SIMD_T::castps_si(vMask);
+
+for (uint32_t a = 0; a < numAttribs; ++a)
+{
+auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, 
vGatherOffsets, vMask);
+auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float)), vGatherOffsets, vMask);
+auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 2), vGatherOffsets, vMask);
+auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 3), vGatherOffsets, vMask);
+
+SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float)), viMask, attribGatherY);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float) * 2), viMask, attribGatherZ);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float) * 3), viMask, attribGatherW);
+
+pSrcBase += sizeof(float) * 4;
+pDstBase += sizeof(typename SIMD_T::Float) * 4;
+}
+remainingVerts -= SimdWidth;
+}
+}