Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
On Sep 25, 2017, at 11:31 AM, Rowley, Timothy O> wrote: Ok, made the following changes - want a full v2 commit, or ok to do this on push? I'm fine with doing it on push and don't need a full v2. It simply replaces a couple magic numbers with their defines -- no functional change. I'll mark the entire set rvb in patch 0/9. --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key ) pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; pGS->controlDataSize = 8; // GS ouputs max of 8 32B units - pGS->controlDataOffset = 32; - pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32; + pGS->controlDataOffset = VERTEX_COUNT_SIZE; + pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; pGS->allocationSize = - 32 + // vertex count - (8 * 32) + // control header + VERTEX_COUNT_SIZE + // vertex count + CONTROL_HEADER_SIZE + // control header (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex pGS->maxNumVerts; // num verts On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce > wrote: On Sep 21, 2017, at 7:46 PM, Tim Rowley > wrote: One piglit regression, which was a false pass: spec@glsl-1.50@execution@geometry@dynamic_input_array_index --- .../drivers/swr/rasterizer/core/frontend.cpp | 227 - src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- src/gallium/drivers/swr/swr_shader.cpp | 183 - 3 files changed, 253 insertions(+), 212 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f882869..26e76a9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; -template -struct GsBufferInfo 
+// Buffers that are allocated if GS is enabled +struct GsBuffers { -GsBufferInfo(const SWR_GS_STATE &gsState) -{ -const uint32_t vertexCount = gsState.maxNumVerts; -const uint32_t vertexStride = sizeof(SIMDVERTEX); -const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH; +uint8_t* pGsIn; +uint8_t* pGsOut[KNOB_SIMD_WIDTH]; +uint8_t* pGsTransposed; +void* pStreamCutBuffer; +}; -vertexPrimitiveStride = vertexStride * numSimdBatches; -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; +// +/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler +/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader +/// @param numVerts - Number of vertices outputted by the GS +/// @param numAttribs - Number of attributes per vertex +template<typename SIMD_T, uint32_t SimdWidth> +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) +{ +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; +uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; -if (gsState.isSingleStream) -{ -cutPrimitiveStride = (vertexCount + 7) / 8; -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; -streamCutPrimitiveStride = 0; -streamCutInstanceStride = 0; -} -else -{ -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; - -streamCutPrimitiveStride = (vertexCount + 7) / 8; -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; -} +for (uint32_t i = 0; i < SimdWidth; ++i) +{ +gatherOffsets[i] = srcVertexStride * i; } +auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); -uint32_t vertexPrimitiveStride; -uint32_t vertexInstanceStride; +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; +uint32_t remainingVerts = numVerts; -uint32_t cutPrimitiveStride; -uint32_t
cutInstanceStride; +for (uint32_t s = 0; s < numSimd; ++s) +{ +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; +uint8_t* pDstBase = pDst + s * dstVertexStride; -uint32_t streamCutPrimitiveStride; -uint32_t streamCutInstanceStride; -}; +// Compute mask to prevent src overflow +
Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
Ok, made the following changes - want a full v2 commit, or ok to do this on push? --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -533,12 +533,12 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key ) pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; pGS->controlDataSize = 8; // GS ouputs max of 8 32B units - pGS->controlDataOffset = 32; - pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * 32; + pGS->controlDataOffset = VERTEX_COUNT_SIZE; + pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE; pGS->allocationSize = - 32 + // vertex count - (8 * 32) + // control header + VERTEX_COUNT_SIZE + // vertex count + CONTROL_HEADER_SIZE + // control header (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex pGS->maxNumVerts; // num verts On Sep 23, 2017, at 9:51 PM, Cherniak, Bruce> wrote: On Sep 21, 2017, at 7:46 PM, Tim Rowley > wrote: One piglit regression, which was a false pass: spec@glsl-1.50@execution@geometry@dynamic_input_array_index --- .../drivers/swr/rasterizer/core/frontend.cpp | 227 - src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- src/gallium/drivers/swr/swr_shader.cpp | 183 - 3 files changed, 253 insertions(+), 212 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f882869..26e76a9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; -template -struct GsBufferInfo +// Buffers that are allocated if GS is enabled +struct GsBuffers { -GsBufferInfo(const SWR_GS_STATE ) -{ -const uint32_t vertexCount = gsState.maxNumVerts; -const uint32_t vertexStride = sizeof(SIMDVERTEX); -const uint32_t numSimdBatches = 
(vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH; +uint8_t* pGsIn; +uint8_t* pGsOut[KNOB_SIMD_WIDTH]; +uint8_t* pGsTransposed; +void* pStreamCutBuffer; +}; -vertexPrimitiveStride = vertexStride * numSimdBatches; -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; +// +/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler +/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader +/// @param numVerts - Number of vertices outputted by the GS +/// @param numAttribs - Number of attributes per vertex +template +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) +{ +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; +uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; -if (gsState.isSingleStream) -{ -cutPrimitiveStride = (vertexCount + 7) / 8; -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; -streamCutPrimitiveStride = 0; -streamCutInstanceStride = 0; -} -else -{ -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; - -streamCutPrimitiveStride = (vertexCount + 7) / 8; -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; -} +for (uint32_t i = 0; i < SimdWidth; ++i) +{ +gatherOffsets[i] = srcVertexStride * i; } +auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)[0]); -uint32_t vertexPrimitiveStride; -uint32_t vertexInstanceStride; +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; +uint32_t remainingVerts = numVerts; -uint32_t cutPrimitiveStride; -uint32_t cutInstanceStride; +for (uint32_t s = 0; s < numSimd; ++s) +{ +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; +uint8_t* pDstBase = pDst + s * dstVertexStride; -uint32_t streamCutPrimitiveStride; -uint32_t streamCutInstanceStride; 
-}; +// Compute mask to prevent src overflow +uint32_t mask = std::min(remainingVerts, SimdWidth); +mask = GenMask(mask); +auto vMask = SIMD_T::vmask_ps(mask); +auto viMask = SIMD_T::castps_si(vMask); + +for (uint32_t a = 0; a < numAttribs; ++a) +{ +auto attribGatherX = SIMD_T::template
Re: [Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
> On Sep 21, 2017, at 7:46 PM, Tim Rowleywrote: > > One piglit regression, which was a false pass: > spec@glsl-1.50@execution@geometry@dynamic_input_array_index > --- > .../drivers/swr/rasterizer/core/frontend.cpp | 227 - > src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- > src/gallium/drivers/swr/swr_shader.cpp | 183 - > 3 files changed, 253 insertions(+), 212 deletions(-) > > diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > index f882869..26e76a9 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* > pStreamIdBase, uint32_t num > > THREAD SWR_GS_CONTEXT tlsGsContext; > > -template > -struct GsBufferInfo > +// Buffers that are allocated if GS is enabled > +struct GsBuffers > { > -GsBufferInfo(const SWR_GS_STATE ) > -{ > -const uint32_t vertexCount = gsState.maxNumVerts; > -const uint32_t vertexStride = sizeof(SIMDVERTEX); > -const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / > SIMD_WIDTH; > +uint8_t* pGsIn; > +uint8_t* pGsOut[KNOB_SIMD_WIDTH]; > +uint8_t* pGsTransposed; > +void* pStreamCutBuffer; > +}; > > -vertexPrimitiveStride = vertexStride * numSimdBatches; > -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; > +// > +/// @brief Transposes GS output from SOA to AOS to feed the primitive > assembler > +/// @param pDst - Destination buffer in AOS form for the current SIMD width, > fed into the primitive assembler > +/// @param pSrc - Buffer of vertices in SOA form written by the geometry > shader > +/// @param numVerts - Number of vertices outputted by the GS > +/// @param numAttribs - Number of attributes per vertex > +template > +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, > uint32_t numAttribs) > +{ > +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; > +uint32_t 
dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * > 4; > > -if (gsState.isSingleStream) > -{ > -cutPrimitiveStride = (vertexCount + 7) / 8; > -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; > +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; > > -streamCutPrimitiveStride = 0; > -streamCutInstanceStride = 0; > -} > -else > -{ > -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); > -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; > - > -streamCutPrimitiveStride = (vertexCount + 7) / 8; > -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; > -} > +for (uint32_t i = 0; i < SimdWidth; ++i) > +{ > +gatherOffsets[i] = srcVertexStride * i; > } > +auto vGatherOffsets = SIMD_T::load_si((typename > SIMD_T::Integer*)&gatherOffsets[0]); > > -uint32_t vertexPrimitiveStride; > -uint32_t vertexInstanceStride; > +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; > +uint32_t remainingVerts = numVerts; > > -uint32_t cutPrimitiveStride; > -uint32_t cutInstanceStride; > +for (uint32_t s = 0; s < numSimd; ++s) > +{ > +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; > +uint8_t* pDstBase = pDst + s * dstVertexStride; > > -uint32_t streamCutPrimitiveStride; > -uint32_t streamCutInstanceStride; > -}; > +// Compute mask to prevent src overflow > +uint32_t mask = std::min(remainingVerts, SimdWidth); > +mask = GenMask(mask); > +auto vMask = SIMD_T::vmask_ps(mask); > +auto viMask = SIMD_T::castps_si(vMask); > + > +for (uint32_t a = 0; a < numAttribs; ++a) > +{ > +auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, > vGatherOffsets, vMask); > +auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float)), vGatherOffsets, vMask); > +auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float) * 2),
vGatherOffsets, vMask); > +auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float) * 3), vGatherOffsets, vMask); > + > +SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); > +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename > SIMD_T::Float)), viMask, attribGatherY); > +
[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API
One piglit regression, which was a false pass: spec@glsl-1.50@execution@geometry@dynamic_input_array_index --- .../drivers/swr/rasterizer/core/frontend.cpp | 227 - src/gallium/drivers/swr/rasterizer/core/state.h| 55 +++-- src/gallium/drivers/swr/swr_shader.cpp | 183 - 3 files changed, 253 insertions(+), 212 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f882869..26e76a9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; -template -struct GsBufferInfo +// Buffers that are allocated if GS is enabled +struct GsBuffers { -GsBufferInfo(const SWR_GS_STATE ) -{ -const uint32_t vertexCount = gsState.maxNumVerts; -const uint32_t vertexStride = sizeof(SIMDVERTEX); -const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH; +uint8_t* pGsIn; +uint8_t* pGsOut[KNOB_SIMD_WIDTH]; +uint8_t* pGsTransposed; +void* pStreamCutBuffer; +}; -vertexPrimitiveStride = vertexStride * numSimdBatches; -vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; +// +/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler +/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler +/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader +/// @param numVerts - Number of vertices outputted by the GS +/// @param numAttribs - Number of attributes per vertex +template +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) +{ +uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; +uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; -if (gsState.isSingleStream) -{ -cutPrimitiveStride = (vertexCount + 7) / 8; 
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; +OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; -streamCutPrimitiveStride = 0; -streamCutInstanceStride = 0; -} -else -{ -cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); -cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; - -streamCutPrimitiveStride = (vertexCount + 7) / 8; -streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; -} +for (uint32_t i = 0; i < SimdWidth; ++i) +{ +gatherOffsets[i] = srcVertexStride * i; } +auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); -uint32_t vertexPrimitiveStride; -uint32_t vertexInstanceStride; +uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; +uint32_t remainingVerts = numVerts; -uint32_t cutPrimitiveStride; -uint32_t cutInstanceStride; +for (uint32_t s = 0; s < numSimd; ++s) +{ +uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; +uint8_t* pDstBase = pDst + s * dstVertexStride; -uint32_t streamCutPrimitiveStride; -uint32_t streamCutInstanceStride; -}; +// Compute mask to prevent src overflow +uint32_t mask = std::min(remainingVerts, SimdWidth); +mask = GenMask(mask); +auto vMask = SIMD_T::vmask_ps(mask); +auto viMask = SIMD_T::castps_si(vMask); + +for (uint32_t a = 0; a < numAttribs; ++a) +{ +auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); +auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); +auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); +auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); + +SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ); +SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW); + +pSrcBase += sizeof(float) * 4; +pDstBase += sizeof(typename SIMD_T::Float) * 4; +} +remainingVerts -= SimdWidth; +} +}