> On Sep 21, 2017, at 7:46 PM, Tim Rowley <timothy.o.row...@intel.com> wrote: > > One piglit regression, which was a false pass: > spec@glsl-1.50@execution@geometry@dynamic_input_array_index > --- > .../drivers/swr/rasterizer/core/frontend.cpp | 227 ++++++++++++--------- > src/gallium/drivers/swr/rasterizer/core/state.h | 55 +++-- > src/gallium/drivers/swr/swr_shader.cpp | 183 ++++++++--------- > 3 files changed, 253 insertions(+), 212 deletions(-) > > diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > index f882869..26e76a9 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp > @@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* > pStreamIdBase, uint32_t num > > THREAD SWR_GS_CONTEXT tlsGsContext; > > -template<typename SIMDVERTEX, uint32_t SIMD_WIDTH> > -struct GsBufferInfo > +// Buffers that are allocated if GS is enabled > +struct GsBuffers > { > - GsBufferInfo(const SWR_GS_STATE &gsState) > - { > - const uint32_t vertexCount = gsState.maxNumVerts; > - const uint32_t vertexStride = sizeof(SIMDVERTEX); > - const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / > SIMD_WIDTH; > + uint8_t* pGsIn; > + uint8_t* pGsOut[KNOB_SIMD_WIDTH]; > + uint8_t* pGsTransposed; > + void* pStreamCutBuffer; > +}; > > - vertexPrimitiveStride = vertexStride * numSimdBatches; > - vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH; > +////////////////////////////////////////////////////////////////////////// > +/// @brief Transposes GS output from SOA to AOS to feed the primitive > assembler > +/// @param pDst - Destination buffer in AOS form for the current SIMD width, > fed into the primitive assembler > +/// @param pSrc - Buffer of vertices in SOA form written by the geometry > shader > +/// @param numVerts - Number of vertices outputted by the GS > +/// @param numAttribs - Number of attributes per 
vertex > +template<typename SIMD_T, uint32_t SimdWidth> > +void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, > uint32_t numAttribs) > +{ > + uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; > + uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * > 4; > > - if (gsState.isSingleStream) > - { > - cutPrimitiveStride = (vertexCount + 7) / 8; > - cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; > + OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; > > - streamCutPrimitiveStride = 0; > - streamCutInstanceStride = 0; > - } > - else > - { > - cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4); > - cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH; > - > - streamCutPrimitiveStride = (vertexCount + 7) / 8; > - streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH; > - } > + for (uint32_t i = 0; i < SimdWidth; ++i) > + { > + gatherOffsets[i] = srcVertexStride * i; > } > + auto vGatherOffsets = SIMD_T::load_si((typename > SIMD_T::Integer*)&gatherOffsets[0]); > > - uint32_t vertexPrimitiveStride; > - uint32_t vertexInstanceStride; > + uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; > + uint32_t remainingVerts = numVerts; > > - uint32_t cutPrimitiveStride; > - uint32_t cutInstanceStride; > + for (uint32_t s = 0; s < numSimd; ++s) > + { > + uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; > + uint8_t* pDstBase = pDst + s * dstVertexStride; > > - uint32_t streamCutPrimitiveStride; > - uint32_t streamCutInstanceStride; > -}; > + // Compute mask to prevent src overflow > + uint32_t mask = std::min(remainingVerts, SimdWidth); > + mask = GenMask(mask); > + auto vMask = SIMD_T::vmask_ps(mask); > + auto viMask = SIMD_T::castps_si(vMask); > + > + for (uint32_t a = 0; a < numAttribs; ++a) > + { > + auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename > SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, > vGatherOffsets, vMask); > + auto attribGatherY = 
SIMD_T::template mask_i32gather_ps<typename > SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float)), vGatherOffsets, vMask); > + auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename > SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float) * 2), vGatherOffsets, vMask); > + auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename > SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + > sizeof(float) * 3), vGatherOffsets, vMask); > + > + SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); > + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename > SIMD_T::Float)), viMask, attribGatherY); > + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename > SIMD_T::Float) * 2), viMask, attribGatherZ); > + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename > SIMD_T::Float) * 3), viMask, attribGatherW); > + > + pSrcBase += sizeof(float) * 4; > + pDstBase += sizeof(typename SIMD_T::Float) * 4; > + } > + remainingVerts -= SimdWidth; > + } > +} > > ////////////////////////////////////////////////////////////////////////// > /// @brief Implements GS stage. 
> @@ -763,9 +785,7 @@ static void GeometryShaderStage( > DRAW_CONTEXT *pDC, > uint32_t workerId, > PA_STATE& pa, > - void* pGsOut, > - void* pCutBuffer, > - void* pStreamCutBuffer, > + GsBuffers* pGsBuffers, > uint32_t* pSoPrimData, > #if USE_SIMD16_FRONTEND > uint32_t numPrims_simd8, > @@ -779,25 +799,29 @@ static void GeometryShaderStage( > const API_STATE& state = GetApiState(pDC); > const SWR_GS_STATE* pState = &state.gsState; > > - SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); > - SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be > initialized"); > + static uint8_t sNullBuffer[1024] = { 0 }; > > - tlsGsContext.pStream = (uint8_t*)pGsOut; > - tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; > + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) > + { > + tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i]; > + } > + tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; > tlsGsContext.PrimitiveID = primID; > > uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); > simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; > > // assemble all attributes for the input primitive > + tlsGsContext.inputVertStride = pState->inputVertStride; > for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) > { > + uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; > uint32_t attribSlot = pState->vertexAttribOffset + slot; > - pa.Assemble(attribSlot, attrib); > + pa.Assemble(srcAttribSlot, attrib); > > for (uint32_t i = 0; i < numVertsPerPrim; ++i) > { > - tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = > attrib[i]; > + tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = > attrib[i]; > } > } > > @@ -805,15 +829,9 @@ static void GeometryShaderStage( > pa.Assemble(VERTEX_POSITION_SLOT, attrib); > for (uint32_t i = 0; i < numVertsPerPrim; ++i) > { > - tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; > + tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * > i] = 
attrib[i]; > } > > -#if USE_SIMD16_FRONTEND > - const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> > bufferInfo(state.gsState); > -#else > - const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> > bufferInfo(state.gsState); > -#endif > - > // record valid prims from the frontend to avoid over binning the newly > generated > // prims from the GS > #if USE_SIMD16_FRONTEND > @@ -830,8 +848,10 @@ static void GeometryShaderStage( > // execute the geometry shader > state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext); > > - tlsGsContext.pStream += bufferInfo.vertexInstanceStride; > - tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride; > + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) > + { > + tlsGsContext.pStreams[i] += pState->allocationSize; > + } > } > > // set up new binner and state for the GS output topology > @@ -865,32 +885,48 @@ static void GeometryShaderStage( > // foreach input prim: > // - setup a new PA based on the emitted verts for that prim > // - loop over the new verts, calling PA to assemble each prim > - uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount; > uint32_t* pPrimitiveId = (uint32_t*)&primID; > > uint32_t totalPrimsGenerated = 0; > for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) > { > - uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * > bufferInfo.vertexPrimitiveStride; > - uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * > bufferInfo.cutPrimitiveStride; > + uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim]; > + > + // Vertex count is either emitted by shader or static > + uint32_t vertexCount = 0; > + if (pState->staticVertexCount) > + { > + vertexCount = pState->staticVertexCount; > + } > + else > + { > + // If emitted in shader, it should be the stored in the first > dword of the output buffer > + vertexCount = *(uint32_t*)pInstanceBase; > + } > > for (uint32_t instance = 0; instance < pState->instanceCount; > ++instance) > { > - uint32_t numEmittedVerts = 
pVertexCount[inputPrim]; > + uint32_t numEmittedVerts = vertexCount; > if (numEmittedVerts == 0) > { > continue; > } > > - uint8_t* pBase = pInstanceBase + instance * > bufferInfo.vertexInstanceStride; > - uint8_t* pCutBase = pCutBufferBase + instance * > bufferInfo.cutInstanceStride; > + uint8_t* pBase = pInstanceBase + instance * > pState->allocationSize; > + uint8_t* pCutBase = pState->controlDataSize == 0 ? > &sNullBuffer[0] : pBase + pState->controlDataOffset; > + uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset; > + > +#if USE_SIMD16_FRONTEND > + TransposeSOAtoAOS<SIMD512, > KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, > vertexCount, pState->outputVertexSize); > +#else > + TransposeSOAtoAOS<SIMD256, > KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, > vertexCount, pState->outputVertexSize); > +#endif > > uint32_t numAttribs = state.feNumAttributes; > > for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) > { > bool processCutVerts = false; > - > uint8_t* pCutBuffer = pCutBase; > > // assign default stream ID, only relevant when GS is > outputting a single stream > @@ -910,16 +946,16 @@ static void GeometryShaderStage( > } > > // multi-stream output, need to translate StreamID buffer > to a cut buffer > - ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, > (uint8_t*)pStreamCutBuffer); > - pCutBuffer = (uint8_t*)pStreamCutBuffer; > + ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, > (uint8_t*)pGsBuffers->pStreamCutBuffer); > + pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; > processCutVerts = false; > } > > #if USE_SIMD16_FRONTEND > - PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, > SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), > numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); > + PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, > numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask > *>(pCutBuffer), 
numEmittedVerts, numAttribs, pState->outputTopology, > processCutVerts); > > #else > - PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, > SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, > pState->outputTopology, processCutVerts); > + PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, > numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, > numAttribs, pState->outputTopology, processCutVerts); > > #endif > while (gsPa.GetNextStreamOutput()) > @@ -979,42 +1015,40 @@ static void GeometryShaderStage( > /// @param state - API state > /// @param ppGsOut - pointer to GS output buffer allocation > /// @param ppCutBuffer - pointer to GS output cut buffer allocation > -template<typename SIMDVERTEX, uint32_t SIMD_WIDTH> > -static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& > state, void** ppGsOut, void** ppCutBuffer, > - void **ppStreamCutBuffer) > +template<typename SIMD_T, uint32_t SIMD_WIDTH> > +static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& > state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers) > { > auto pArena = pDC->pArena; > SWR_ASSERT(pArena != nullptr); > SWR_ASSERT(state.gsState.gsEnable); > > - // allocate arena space to hold GS output verts > - // @todo pack attribs > - // @todo support multiple streams > + const SWR_GS_STATE& gsState = state.gsState; > > - const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState); > + // Allocate storage for vertex inputs > + uint32_t vertexInBufferSize = gsState.inputVertStride * > sizeof(simdvector) * vertsPerPrim; > + pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, > 32); > > - const uint32_t vertexBufferSize = state.gsState.instanceCount * > bufferInfo.vertexInstanceStride; > + // Allocate arena space to hold GS output verts > + const uint32_t vertexBufferSize = gsState.instanceCount * > gsState.allocationSize; > > - *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * > sizeof(float)); > + for 
(uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) > + { > + pGsBuffers->pGsOut[i] = > (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32); > + } > > - // allocate arena space to hold cut or streamid buffer, which is > essentially a bitfield sized to the > - // maximum vertex output as defined by the GS state, per SIMD lane, per > GS instance > + // Allocate storage for transposed GS output > + uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / > SIMD_WIDTH; > + uint32_t transposedBufferSize = numSimdBatches * > gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4); > + pGsBuffers->pGsTransposed = > (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); > > - // allocate space for temporary per-stream cut buffer if multi-stream is > enabled > + // Allocate storage to hold temporary stream->cut buffer, if necessary > if (state.gsState.isSingleStream) > { > - const uint32_t cutBufferSize = state.gsState.instanceCount * > bufferInfo.cutInstanceStride; > - > - *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * > sizeof(float)); > - *ppStreamCutBuffer = nullptr; > + pGsBuffers->pStreamCutBuffer = nullptr; > } > else > { > - const uint32_t cutBufferSize = state.gsState.instanceCount * > bufferInfo.cutInstanceStride; > - const uint32_t streamCutBufferSize = state.gsState.instanceCount * > bufferInfo.streamCutInstanceStride; > - > - *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * > sizeof(float)); > - *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, > SIMD_WIDTH * sizeof(float)); > + pGsBuffers->pStreamCutBuffer = > (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); > } > } > > @@ -1062,9 +1096,7 @@ static void TessellationStages( > DRAW_CONTEXT *pDC, > uint32_t workerId, > PA_STATE& pa, > - void* pGsOut, > - void* pCutBuffer, > - void* pCutStreamBuffer, > + GsBuffers* pGsBuffers, > uint32_t* pSoPrimData, > #if USE_SIMD16_FRONTEND > uint32_t numPrims_simd8, > @@ -1264,17 +1296,16 @@ static 
void TessellationStages( > { > #if USE_SIMD16_FRONTEND > tessPa.useAlternateOffset = false; > - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, > tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, > primID_lo); > + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, > tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); > > if (numPrims_hi) > { > tessPa.useAlternateOffset = true; > - GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, > workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, > numPrims_hi, primID_hi); > + GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, > workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); > } > #else > GeometryShaderStage<HasStreamOutT, HasRastT>( > - pDC, workerId, tessPa, pGsOut, pCutBuffer, > pCutStreamBuffer, pSoPrimData, > - _simd_set1_epi32(dsContext.PrimitiveID)); > + pDC, workerId, tessPa, pGsBuffers, pSoPrimData, > _simd_set1_epi32(dsContext.PrimitiveID)); > #endif > } > else > @@ -1408,15 +1439,13 @@ void ProcessDraw( > uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); > #endif > > - void* pGsOut = nullptr; > - void* pCutBuffer = nullptr; > - void* pStreamCutBuffer = nullptr; > + GsBuffers gsBuffers; > if (HasGeometryShaderT::value) > { > #if USE_SIMD16_FRONTEND > - AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, > &pGsOut, &pCutBuffer, &pStreamCutBuffer); > + AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, > NumVertsPerPrim(state.topology, true), &gsBuffers); > #else > - AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, > &pCutBuffer, &pStreamCutBuffer); > + AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, > NumVertsPerPrim(state.topology, true), &gsBuffers); > #endif > } > > @@ -1672,23 +1701,23 @@ void ProcessDraw( > if (HasTessellationT::value) > { > pa.useAlternateOffset = false; > - TessellationStages<HasGeometryShaderT, > HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, 
pCutBuffer, > pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo); > + TessellationStages<HasGeometryShaderT, > HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, > numPrims_lo, primID_lo); > > if (numPrims_hi) > { > pa.useAlternateOffset = true; > - TessellationStages<HasGeometryShaderT, > HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, > pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi); > + TessellationStages<HasGeometryShaderT, > HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, > numPrims_hi, primID_hi); > } > } > else if (HasGeometryShaderT::value) > { > pa.useAlternateOffset = false; > - GeometryShaderStage<HasStreamOutT, > HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, > pSoPrimData, numPrims_lo, primID_lo); > + GeometryShaderStage<HasStreamOutT, > HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); > > if (numPrims_hi) > { > pa.useAlternateOffset = true; > - GeometryShaderStage<HasStreamOutT, > HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, > pSoPrimData, numPrims_hi, primID_hi); > + GeometryShaderStage<HasStreamOutT, > HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); > } > } > else > @@ -1847,12 +1876,12 @@ void ProcessDraw( > if (HasTessellationT::value) > { > TessellationStages<HasGeometryShaderT, > HasStreamOutT, HasRastT>( > - pDC, workerId, pa, pGsOut, pCutBuffer, > pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); > + pDC, workerId, pa, &gsBuffers, > pSoPrimData, pa.GetPrimID(work.startPrimID)); > } > else if (HasGeometryShaderT::value) > { > GeometryShaderStage<HasStreamOutT, HasRastT>( > - pDC, workerId, pa, pGsOut, pCutBuffer, > pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); > + pDC, workerId, pa, &gsBuffers, > pSoPrimData, pa.GetPrimID(work.startPrimID)); > } > else > { > diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h > 
b/src/gallium/drivers/swr/rasterizer/core/state.h > index 13c1d8b..f7c9308 100644 > --- a/src/gallium/drivers/swr/rasterizer/core/state.h > +++ b/src/gallium/drivers/swr/rasterizer/core/state.h > @@ -301,13 +301,12 @@ struct SWR_DS_CONTEXT > ///////////////////////////////////////////////////////////////////////// > struct SWR_GS_CONTEXT > { > - simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for > SIMD prims > - simdscalari PrimitiveID; // IN: input primitive ID generated from > the draw call > - uint32_t InstanceID; // IN: input instance ID > - simdscalari mask; // IN: Active mask for shader > - uint8_t* pStream; // OUT: output stream (contains vertices > for all output streams) > - uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer > - simdscalari vertexCount; // OUT: num vertices emitted per SIMD > lane > + simdvector* pVerts; // IN: input primitive data for SIMD > prims > + uint32_t inputVertStride; // IN: input vertex stride, in > attributes > + simdscalari PrimitiveID; // IN: input primitive ID generated > from the draw call > + uint32_t InstanceID; // IN: input instance ID > + simdscalari mask; // IN: Active mask for shader > + uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains > vertices for all output streams) > }; > > struct PixelPositions > @@ -714,30 +713,56 @@ struct SWR_GS_STATE > { > bool gsEnable; > > - // number of input attributes per vertex. used by the frontend to > + // Number of input attributes per vertex. 
Used by the frontend to > // optimize assembling primitives for GS > uint32_t numInputAttribs; > > - // output topology - can be point, tristrip, or linestrip > + // Stride of incoming verts in attributes > + uint32_t inputVertStride; > + > + // Output topology - can be point, tristrip, or linestrip > PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum > > - // maximum number of verts that can be emitted by a single instance of > the GS > + // Maximum number of verts that can be emitted by a single instance of > the GS > uint32_t maxNumVerts; > > - // instance count > + // Instance count > uint32_t instanceCount; > > - // if true, geometry shader emits a single stream, with separate cut > buffer. > - // if false, geometry shader emits vertices for multiple streams to the > stream buffer, with a separate StreamID buffer > + // If true, geometry shader emits a single stream, with separate cut > buffer. > + // If false, geometry shader emits vertices for multiple streams to the > stream buffer, with a separate StreamID buffer > // to map vertices to streams > bool isSingleStream; > > - // when single stream is enabled, singleStreamID dictates which stream > is being output. > + // When single stream is enabled, singleStreamID dictates which stream > is being output. > // field ignored if isSingleStream is false > uint32_t singleStreamID; > > - // Offset to the start of the attributes of the input vertices, in > simdvector units > + // Total amount of memory to allocate for one instance of the shader > output in bytes > + uint32_t allocationSize; > + > + // Offset to the start of the attributes of the input vertices, in > simdvector units, as read by the GS > uint32_t vertexAttribOffset; > + > + // Offset to the attributes as stored by the preceding shader stage. > + uint32_t srcVertexAttribOffset; > + > + // Size of the control data section which contains cut or streamID data, > in simdscalar units. 
Should be sized to handle > + // the maximum number of verts output by the GS. Can be 0 if there are > no cuts or streamID bits. > + uint32_t controlDataSize; > + > + // Offset to the control data section, in bytes > + uint32_t controlDataOffset; > + > + // Total size of an output vertex, in simdvector units > + uint32_t outputVertexSize; > + > + // Offset to the start of the vertex section, in bytes > + uint32_t outputVertexOffset; > + > + // Set this to non-zero to indicate that the shader outputs a static > number of verts. If zero, shader is > + // expected to store the final vertex count in the first dword of the gs > output stream. > + uint32_t staticVertexCount; > }; > > > diff --git a/src/gallium/drivers/swr/swr_shader.cpp > b/src/gallium/drivers/swr/swr_shader.cpp > index 0a81eaa..7f11e72 100644 > --- a/src/gallium/drivers/swr/swr_shader.cpp > +++ b/src/gallium/drivers/swr/swr_shader.cpp > @@ -347,18 +347,20 @@ BuilderSWR::swr_gs_llvm_fetch_input(const struct > lp_build_tgsi_gs_iface *gs_ifac > Value *attrib = > LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)})); > > - Value *pInput = > - LOAD(GEP(iface->pGsCtx, > - {C(0), > - C(SWR_GS_CONTEXT_vert), > - unwrap(vertex_index), > - C(0), > - attrib, > - unwrap(swizzle_index)})); > + Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts}); > + Value *pInputVertStride = LOAD(iface->pGsCtx, {0, > SWR_GS_CONTEXT_inputVertStride}); > + > + Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), > attrib); > + > + Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)})); > > return wrap(pInput); > } > > +// GS output stream layout > +#define VERTEX_COUNT_SIZE 32 > +#define CONTROL_HEADER_SIZE (8*32) > + > void > BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface > *gs_base, > struct lp_build_tgsi_context * bld_base, > @@ -366,41 +368,19 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct > lp_build_tgsi_gs_iface *gs_base > LLVMValueRef 
emitted_vertices_vec) > { > swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; > - SWR_GS_STATE *pGS = iface->pGsState; > > IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); > > -#if USE_SIMD16_FRONTEND > - const uint32_t simdVertexStride = sizeof(simdvertex) * 2; > - const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / > (mVWidth * 2); > -#else > - const uint32_t simdVertexStride = sizeof(simdvertex); > - const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / > mVWidth; > -#endif > - const uint32_t inputPrimStride = numSimdBatches * simdVertexStride; > - > - Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream }); > - Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); > - Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); > + const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE; > + const uint32_t attribSize = 4 * sizeof(float); > + const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS; > + Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), > VIMMED1(vertSize)); > > - Value *vOffsets = C({ > - inputPrimStride * 0, > - inputPrimStride * 1, > - inputPrimStride * 2, > - inputPrimStride * 3, > - inputPrimStride * 4, > - inputPrimStride * 5, > - inputPrimStride * 6, > - inputPrimStride * 7 } ); > + Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask}); > + Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth)); > > -#if USE_SIMD16_FRONTEND > - const uint32_t simdShift = log2(mVWidth * 2); > - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1); > -#else > - const uint32_t simdShift = log2(mVWidth); > - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1); > -#endif > - Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift); > + Value *pStack = STACKSAVE(); > + Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane > masking > > for (uint32_t attrib = 0; attrib < 
iface->num_outputs; ++attrib) { > uint32_t attribSlot = attrib; > @@ -420,46 +400,36 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct > lp_build_tgsi_gs_iface *gs_base > } > } > > -#if USE_SIMD16_FRONTEND > - Value *vOffsetsAttrib = > - ADD(vOffsets, MUL(vVertexSlot, > VIMMED1((uint32_t)sizeof(simdvertex) * 2))); > - vOffsetsAttrib = > - ADD(vOffsetsAttrib, > VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2))); > -#else > - Value *vOffsetsAttrib = > - ADD(vOffsets, MUL(vVertexSlot, > VIMMED1((uint32_t)sizeof(simdvertex)))); > - vOffsetsAttrib = > - ADD(vOffsetsAttrib, > VIMMED1((uint32_t)(attribSlot*sizeof(simdvector)))); > -#endif > - vOffsetsAttrib = > - ADD(vOffsetsAttrib, MUL(vSimdSlot, > VIMMED1((uint32_t)sizeof(float)))); > + Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + > attribSize * attribSlot)); // + sgvChannel ? > > - for (uint32_t channel = 0; channel < 4; ++channel) { > - Value *vPtrs = GEP(pStream, vOffsetsAttrib); > - Value *vData; > + for (uint32_t lane = 0; lane < mVWidth; ++lane) { > + Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane)); > + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, > lane}); > + Value *pStreamOffset = GEP(pStream, pLaneOffset); > + pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy); > > - if (attribSlot == VERTEX_SGV_SLOT) > - vData = LOAD(unwrap(outputs[attrib][0])); > - else > - vData = LOAD(unwrap(outputs[attrib][channel])); > + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); > + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); > > - if (attribSlot != VERTEX_SGV_SLOT || > - sgvChannel == channel) { > - vPtrs = BITCAST(vPtrs, > - VectorType::get(PointerType::get(mFP32Ty, 0), > 8)); > + for (uint32_t channel = 0; channel < 4; ++channel) { > + Value *vData; > > - MASKED_SCATTER(vData, vPtrs, 32, vMask1); > - } > + if (attribSlot == VERTEX_SGV_SLOT) > + vData = LOAD(unwrap(outputs[attrib][0])); > + else > + vData = LOAD(unwrap(outputs[attrib][channel])); > > -#if 
USE_SIMD16_FRONTEND > - vOffsetsAttrib = > - ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2)); > -#else > - vOffsetsAttrib = > - ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar))); > -#endif > + if (attribSlot != VERTEX_SGV_SLOT || > + sgvChannel == channel) { > + vData = VEXTRACT(vData, C(lane)); > + STORE(vData, pStreamOffset); > + } > + pStreamOffset = GEP(pStreamOffset, C(1)); > + } > } > } > + > + STACKRESTORE(pStack); > } > > void > @@ -469,12 +439,9 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct > lp_build_tgsi_gs_iface *gs_ba > LLVMValueRef emitted_prims_vec) > { > swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base; > - SWR_GS_STATE *pGS = iface->pGsState; > > IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); > > - Value *pCutBuffer = > - LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}); > Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask }); > Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8)); > > @@ -496,31 +463,29 @@ BuilderSWR::swr_gs_llvm_end_primitive(const struct > lp_build_tgsi_gs_iface *gs_ba > mask = AND(mask, cmpMask); > vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8)); > > - const uint32_t cutPrimStride = > - (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth; > - Value *vOffsets = C({ > - (uint32_t)(cutPrimStride * 0), > - (uint32_t)(cutPrimStride * 1), > - (uint32_t)(cutPrimStride * 2), > - (uint32_t)(cutPrimStride * 3), > - (uint32_t)(cutPrimStride * 4), > - (uint32_t)(cutPrimStride * 5), > - (uint32_t)(cutPrimStride * 6), > - (uint32_t)(cutPrimStride * 7) } ); > - > vCount = SUB(vCount, VIMMED1(1)); > - Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets); > + Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), > VIMMED1(VERTEX_COUNT_SIZE)); > Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8))); > > vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8)); > > - Value *vPtrs = GEP(pCutBuffer, vOffset); > - vPtrs = > - BITCAST(vPtrs, 
VectorType::get(PointerType::get(mInt8Ty, 0), > JM()->mVWidth)); > + Value *pStack = STACKSAVE(); > + Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for > lane masking > + > + for (uint32_t lane = 0; lane < mVWidth; ++lane) { > + Value *vLaneOffset = VEXTRACT(vOffset, C(lane)); > + Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, > lane}); > + Value *pStreamOffset = GEP(pStream, vLaneOffset); > + > + Value *pLaneMask = VEXTRACT(vMask1, C(lane)); > + pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr); > > - Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1); > - vValue = OR(vGather, vValue); > - MASKED_SCATTER(vValue, vPtrs, 32, vMask1); > + Value *vVal = LOAD(pStreamOffset); > + vVal = OR(vVal, VEXTRACT(vValue, C(lane))); > + STORE(vVal, pStreamOffset); > + } > + > + STACKRESTORE(pStack); > } > > void > @@ -533,7 +498,14 @@ BuilderSWR::swr_gs_llvm_epilogue(const struct > lp_build_tgsi_gs_iface *gs_base, > > IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); > > - STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, > SWR_GS_CONTEXT_vertexCount}); > + // Store emit count to each output stream in the first DWORD > + for (uint32_t lane = 0; lane < mVWidth; ++lane) > + { > + Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, > lane}); > + pStream = BITCAST(pStream, mInt32PtrTy); > + Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), > C(lane)); > + STORE(pLaneCount, pStream); > + } > } > > PFN_GS_FUNC > @@ -542,6 +514,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, > swr_jit_gs_key &key) > SWR_GS_STATE *pGS = &ctx->gs->gsState; > struct tgsi_shader_info *info = &ctx->gs->info.base; > > + memset(pGS, 0, sizeof(*pGS)); > + > pGS->gsEnable = true; > > pGS->numInputAttribs = info->num_inputs; > @@ -555,6 +529,18 @@ BuilderSWR::CompileGS(struct swr_context *ctx, > swr_jit_gs_key &key) > pGS->singleStreamID = 0; > > pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // 
TODO: optimize > + pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize > + pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset; > + pGS->outputVertexSize = SWR_VTX_NUM_SLOTS; > + pGS->controlDataSize = 8; // GS ouputs max of 8 32B units > + pGS->controlDataOffset = 32; > + pGS->outputVertexOffset = pGS->controlDataOffset + pGS->controlDataSize * > 32; > + > + pGS->allocationSize = > + 32 + // vertex count > + (8 * 32) + // control header > + (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex > + pGS->maxNumVerts; // num verts
Consider using VERTEX_COUNT_SIZE and CONTROL_HEADER_SIZE defines? pGS->controlDataOffset = VERTEX_COUNT_SIZE; pGS->outputVertexOffset = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE; pGS->allocationSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE + (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex pGS->maxNumVerts; // num verts > struct swr_geometry_shader *gs = ctx->gs; > > @@ -635,10 +621,11 @@ BuilderSWR::CompileGS(struct swr_context *ctx, > swr_jit_gs_key &key) > lp_type_float_vec(32, 32 * 8), wrap(mask_val)); > > // zero out cut buffer so we can load/modify/store bits > - MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}), > - C((char)0), > - pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth, > - sizeof(float) * KNOB_SIMD_WIDTH); > + for (uint32_t lane = 0; lane < mVWidth; ++lane) > + { > + Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane}); > + MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, > sizeof(float) * KNOB_SIMD_WIDTH); > + } > > struct swr_gs_llvm_iface gs_iface; > gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input; > -- > 2.7.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev