[Mesa-dev] [PATCH 5/6] swr/rast: don't use 32-bit gathers for elements < 32-bits in size

2018-01-04 Thread Tim Rowley
Using a 32-bit gather for elements smaller than 32 bits can cause
page faults when loading the last elements of a buffer whose size is
page-aligned.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 61 +-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 99a936d176..ad70cbe95d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -741,7 +741,66 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, 
Value* pMask, Value* pB
 // only works if pixel size is <= 32bits
 SWR_ASSERT(info.bpp <= 32);
 
-Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+Value *pGather;
+if (info.bpp == 32)
+{
+pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+}
+else
+{
+// Can't use 32-bit gather for items less than 32-bits, could cause 
page faults.
+Value *pMem = ALLOCA(mSimdInt32Ty);
+STORE(VIMMED1(0u), pMem);
+
+pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
+Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
+
+for (uint32_t lane = 0; lane < mVWidth; ++lane)
+{
+// Get index
+Value* index = VEXTRACT(pOffsets, C(lane));
+Value* mask = VEXTRACT(pMask, C(lane));
+switch (info.bpp)
+{
+case 8:
+{
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt8Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt8Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+
+case 16:
+{
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt16Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt16Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+break;
+
+case 24:
+{
+// First 16-bits of data
+Value* pDst = BITCAST(GEP(pDstMem, C(lane)), 
PointerType::get(mInt16Ty, 0));
+Value* pSrc = BITCAST(GEP(pBase, index), 
PointerType::get(mInt16Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+
+// Last 8-bits of data
+pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
+pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
+STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+break;
+}
+
+default:
+SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
+break;
+}
+}
+
+pGather = LOAD(pMem);
+}
 
 for (uint32_t comp = 0; comp < 4; ++comp)
 {
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)

2018-01-04 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  |  76 +-
 .../drivers/swr/rasterizer/jitter/builder.h|  45 +++---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 133 
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  50 +++---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 168 +++--
 5 files changed, 239 insertions(+), 233 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 4b83a3204c..c46159a35a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -40,52 +40,56 @@ namespace SwrJit
 Builder::Builder(JitManager *pJitMgr)
 : mpJitMgr(pJitMgr)
 {
+SWR_ASSERT(pJitMgr->mVWidth == 8);
+
 mVWidth = pJitMgr->mVWidth;
-#if USE_SIMD16_BUILDER
-mVWidth2 = pJitMgr->mVWidth * 2;
-#endif
+mVWidth16 = pJitMgr->mVWidth * 2;
 
 mpIRBuilder = >mBuilder;
 
-mVoidTy = Type::getVoidTy(pJitMgr->mContext);
-mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
-mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
-mFP32PtrTy = PointerType::get(mFP32Ty, 0);
-mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
-mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
-mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
-mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
-mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
-mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+// Built in types: scalar
+
+mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+mFP32PtrTy  = PointerType::get(mFP32Ty, 0);
+mDoubleTy   = Type::getDoubleTy(pJitMgr->mContext);
+mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+mInt16Ty= Type::getInt16Ty(pJitMgr->mContext);
+mInt32Ty= Type::getInt32Ty(pJitMgr->mContext);
+mInt8PtrTy  = PointerType::get(mInt8Ty, 0);
 mInt16PtrTy = PointerType::get(mInt16Ty, 0);
 mInt32PtrTy = PointerType::get(mInt32Ty, 0);
-mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
-mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
-mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
-mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
-mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
-mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+mInt64Ty= Type::getInt64Ty(pJitMgr->mContext);
+
+// Built in types: simd8
+
+mSimdInt1Ty = VectorType::get(mInt1Ty,  mVWidth);
+mSimdInt16Ty= VectorType::get(mInt16Ty, mVWidth);
+mSimdInt32Ty= VectorType::get(mInt32Ty, mVWidth);
+mSimdInt64Ty= VectorType::get(mInt64Ty, mVWidth);
+mSimdFP16Ty = VectorType::get(mFP16Ty,  mVWidth);
+mSimdFP32Ty = VectorType::get(mFP32Ty,  mVWidth);
+mSimdVectorTy   = ArrayType::get(mSimdFP32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
-#if USE_SIMD16_BUILDER
-mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
-mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
-mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
-mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
-mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
-mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
-mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
-mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
-#endif
+
+// Built in types: simd16
+
+mSimd16Int1Ty   = VectorType::get(mInt1Ty,  mVWidth16);
+mSimd16Int16Ty  = VectorType::get(mInt16Ty, mVWidth16);
+mSimd16Int32Ty  = VectorType::get(mInt32Ty, mVWidth16);
+mSimd16Int64Ty  = VectorType::get(mInt64Ty, mVWidth16);
+mSimd16FP16Ty   = VectorType::get(mFP16Ty,  mVWidth16);
+mSimd16FP32Ty   = VectorType::get(mFP32Ty,  mVWidth16);
+mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4);
+mSimd16VectorTRTy   = ArrayType::get(mSimd16FP32Ty, 5);
 
 if (sizeof(uint32_t*) == 4)
 {
 mIntPtrTy = mInt32Ty;
 mSimdIntPtrTy = mSimdInt32Ty;
-#if USE_SIMD16_BUILDER
-mSimd2IntPtrTy = mSimd2Int32Ty;
-#endif
+mSimd16IntPtrTy = mSimd16Int32Ty;
 }
 else
 {
@@ -93,9 +97,7 @@ namespace SwrJit
 
 mIntPtrTy = mInt64Ty;
 mSimdIntPtrTy = mSimdInt64Ty;
-#if USE_SIMD16_BUILDER
-mSimd2IntPtrTy = mSimd2Int64Ty;
-#endif
+

[Mesa-dev] [PATCH 4/6] swr/rast: autogenerate named structs instead of literal structs

2018-01-04 Thread Tim Rowley
Results in far smaller and more useful IR output.
---
 .../swr/rasterizer/codegen/templates/gen_llvm.hpp  | 23 ++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
index 18ea781713..574ee5aaa7 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
@@ -40,15 +40,22 @@ namespace SwrJit
 INLINE static StructType *Gen_${type['name']}(JitManager* pJitMgr)
 {
 LLVMContext& ctx = pJitMgr->mContext;
-std::vector members;
-<%
-(max_type_len, max_name_len) = calc_max_len(type['members'])
-%>
-%for member in type['members']:
-/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ 
members.push_back( ${member['type']} );
-%endfor
 
-return StructType::get(ctx, members, false);
+StructType* pRetType = 
pJitMgr->mpCurrentModule->getTypeByName("${type['name']}");
+if (pRetType == nullptr)
+{
+std::vector members;
+<%
+(max_type_len, max_name_len) = calc_max_len(type['members'])
+%>
+%for member in type['members']:
+/* ${member['name']} ${pad(len(member['name']), max_name_len)}*/ 
members.push_back(${ member['type'] });
+%endfor
+
+pRetType = StructType::create(members, "${type['name']}", false);
+}
+
+return pRetType;
 }
 
 %for member in type['members']:
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/6] swr/rast: SIMD16 fetch shader jitter cleanup

2018-01-04 Thread Tim Rowley
Bake in USE_SIMD16_BUILDER code paths (for USE_SIMD16_SHADER defined),
remove USE_SIMD16_BUILDER define, remove deprecated pseudo-SIMD16 code
paths.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1118 +++-
 1 file changed, 383 insertions(+), 735 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index ac09a82f6c..99a936d176 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -50,7 +50,6 @@ enum ConversionType
 
 #if USE_SIMD16_SHADERS
 #define USE_SIMD16_GATHERS 0
-#define USE_SIMD16_BUILDER 0
 #endif
 
 //
@@ -61,6 +60,7 @@ struct FetchJit : public Builder
 FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
 
 Function* Create(const FETCH_COMPILE_STATE& fetchState);
+
 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
@@ -69,43 +69,49 @@ struct FetchJit : public Builder
 typedef std::tuple Shuffle8bpcArgs;
+
 #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+void Shuffle8bpcGatherd16(Shuffle8bpcArgs );
+#else
 void Shuffle8bpcGatherd(Shuffle8bpcArgs , bool useVertexID2);
+#endif
 #else
 void Shuffle8bpcGatherd(Shuffle8bpcArgs );
 #endif
-#if USE_SIMD16_BUILDER
-void Shuffle8bpcGatherd2(Shuffle8bpcArgs );
-#endif
 
 typedef std::tuple Shuffle16bpcArgs;
+
 #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+void Shuffle16bpcGather16(Shuffle16bpcArgs );
+#else
 void Shuffle16bpcGather(Shuffle16bpcArgs , bool useVertexID2);
+#endif
 #else
 void Shuffle16bpcGather(Shuffle16bpcArgs );
 #endif
-#if USE_SIMD16_BUILDER
-void Shuffle16bpcGather2(Shuffle16bpcArgs );
-#endif
 
+#if USE_SIMD16_GATHERS
+void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const 
uint32_t numEltsToStore, Value* ()[4]);
+#else
 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const 
uint32_t numEltsToStore, Value* ()[4]);
-#if USE_SIMD16_BUILDER
-void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const 
uint32_t numEltsToStore, Value* ()[4]);
 #endif
 
 #if USE_SIMD16_SHADERS
-Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool 
useVertexID2);
+#if USE_SIMD16_GATHERS
+Value *GenerateCompCtrlVector16(const ComponentControl ctrl);
 #else
-Value* GenerateCompCtrlVector(const ComponentControl ctrl);
+Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool 
useVertexID2);
 #endif
-#if USE_SIMD16_BUILDER
-Value* GenerateCompCtrlVector2(const ComponentControl ctrl);
+#else
+Value *GenerateCompCtrlVector(const ComponentControl ctrl);
 #endif
 
 void JitLoadVertices(const FETCH_COMPILE_STATE , Value* 
streams, Value* vIndices, Value* pVtxOut);
-#if USE_SIMD16_SHADERS
 
+#if USE_SIMD16_SHADERS
 #if USE_SIMD16_GATHERS
 void JitGatherVertices(const FETCH_COMPILE_STATE , Value 
*streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
 #else
@@ -833,21 +839,14 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 uint32_t outputElt = 0;
 Value* vVertexElements[4];
 #if USE_SIMD16_GATHERS
-Value* vVertexElements2[4];
-#if USE_SIMD16_BUILDER
 Value *pVtxSrc2[4];
-#endif
 #endif
 
 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 Value* startInstance = LOAD(mpFetchInfo, {0, 
SWR_FETCH_CONTEXT_StartInstance});
 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
 #if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
 Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_BaseVertex }));
-#else
-Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_BaseVertex }));
-#endif
 #else
 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_BaseVertex }));
 #endif
@@ -874,11 +873,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 Value *stride = LOAD(streams, {ied.StreamIndex, 
SWR_VERTEX_BUFFER_STATE_pitch});
 #if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
 Value *vStride16 = VBROADCAST_16(stride);
-#else
-Value *vStride = VBROADCAST(stride);
-#endif
 #else
 Value *vStride = VBROADCAST(stride);
 #endif
@@ -901,20 +896,14 @@ void FetchJit::JitGatherVertices(const 

[Mesa-dev] [PATCH 6/6] swr/rast: switch win32 jit format to COFF

2018-01-04 Thread Tim Rowley
Allows for call-stack and exception handling for jitted functions.
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 883ac77482..508bc027dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -92,7 +92,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, 
const char* core)
 #if defined(_WIN32)
 // Needed for MCJIT on windows
 Triple hostTriple(sys::getProcessTriple());
-hostTriple.setObjectFormat(Triple::ELF);
+hostTriple.setObjectFormat(Triple::COFF);
 mpCurrentModule->setTargetTriple(hostTriple.getTriple());
 #endif // _WIN32
 
@@ -486,4 +486,4 @@ std::unique_ptr 
JitCache::getObject(const llvm::Module* M)
 fclose(fpIn);
 
 return pBuf;
-}
+}
\ No newline at end of file
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/6] swr/rast: shuffle header files for msvc pre-compiled header usage

2018-01-04 Thread Tim Rowley
---
 src/gallium/drivers/swr/Makefile.sources   |   1 +
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  36 +-
 .../drivers/swr/rasterizer/jitter/JitManager.h |  46 +--
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|   3 +-
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |   1 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|   3 +-
 .../drivers/swr/rasterizer/jitter/jit_api.h|   1 -
 .../drivers/swr/rasterizer/jitter/jit_pch.hpp  | 134 +
 .../swr/rasterizer/jitter/streamout_jit.cpp|   5 +-
 10 files changed, 143 insertions(+), 88 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp

diff --git a/src/gallium/drivers/swr/Makefile.sources 
b/src/gallium/drivers/swr/Makefile.sources
index 53f8bf011b..cd2040e137 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -145,6 +145,7 @@ JITTER_CXX_SOURCES := \
rasterizer/jitter/fetch_jit.cpp \
rasterizer/jitter/fetch_jit.h \
rasterizer/jitter/jit_api.h \
+   rasterizer/jitter/jit_pch.hpp \
rasterizer/jitter/JitManager.cpp \
rasterizer/jitter/JitManager.h \
rasterizer/jitter/streamout_jit.cpp \
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 59672bb545..883ac77482 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -27,41 +27,7 @@
 * Notes:
 * 
 **/
-#if defined(_WIN32)
-#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
-#endif
-
-#pragma push_macro("DEBUG")
-#undef DEBUG
-
-#if defined(_WIN32)
-#include "llvm/ADT/Triple.h"
-#endif
-#include "llvm/IR/Function.h"
-
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SourceMgr.h"
-
-#include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Config/llvm-config.h"
-
-#if LLVM_VERSION_MAJOR < 4
-#include "llvm/Bitcode/ReaderWriter.h"
-#else
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#endif
-
-#if LLVM_USE_INTEL_JITEVENTS
-#include "llvm/ExecutionEngine/JITEventListener.h"
-#endif
-
-#pragma pop_macro("DEBUG")
+#include "jit_pch.hpp"
 
 #include "JitManager.h"
 #include "jit_api.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c30a807222..9e5e4cf2b6 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -29,52 +29,9 @@
 **/
 #pragma once
 
-#if defined(_WIN32)
-#pragma warning(disable : 4146 4244 4267 4800 4996)
-#endif
-
-// llvm 3.7+ reuses "DEBUG" as an enum value
-#pragma push_macro("DEBUG")
-#undef DEBUG
-
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/ExecutionEngine/ObjectCache.h"
-
-#include "llvm/Config/llvm-config.h"
-
-#include "llvm/IR/Verifier.h"
-#include "llvm/ExecutionEngine/MCJIT.h"
-#include "llvm/Support/FileSystem.h"
-#define LLVM_F_NONE sys::fs::F_None
-
-#include "llvm/Analysis/Passes.h"
-
-#include "llvm/IR/LegacyPassManager.h"
-using FunctionPassManager = llvm::legacy::FunctionPassManager;
-using PassManager = llvm::legacy::PassManager;
-
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-
-#include "common/os.h"
+#include "jit_pch.hpp"
 #include "common/isa.hpp"
 
-#include 
-
-#pragma pop_macro("DEBUG")
 
 //
 /// JitInstructionSet
@@ -173,6 +130,7 @@ struct JitManager
 
 uint32_t mVWidth;
 
+
 // Built in types.
 llvm::Type*mInt8Ty;
 llvm::Type*mInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 3258639d38..cc92622978 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -27,13 +27,12 @@
 * Notes:
 *
 

[Mesa-dev] [PATCH 0/6] swr: update rasterizer

2018-01-04 Thread Tim Rowley
Highlights include simd16 cleanup (renaming and removing old
codepaths), fixing a potential crash with the fetch shader, and code
cleanups.

Tim Rowley (6):
  swr/rast: SIMD16 builder - cleanup naming (simd2 -> simd16)
  swr/rast: shuffle header files for msvc pre-compiled header usage
  swr/rast: SIMD16 fetch shader jitter cleanup
  swr/rast: autogenerate named structs instead of literal structs
  swr/rast: don't use 32-bit gathers for elements < 32-bits in size
  swr/rast: switch win32 jit format to COFF

 src/gallium/drivers/swr/Makefile.sources   |1 +
 .../swr/rasterizer/codegen/templates/gen_llvm.hpp  |   23 +-
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |   40 +-
 .../drivers/swr/rasterizer/jitter/JitManager.h |   46 +-
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|3 +-
 .../drivers/swr/rasterizer/jitter/builder.cpp  |   77 +-
 .../drivers/swr/rasterizer/jitter/builder.h|   45 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |  134 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |   50 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1304 
 .../drivers/swr/rasterizer/jitter/jit_api.h|1 -
 .../drivers/swr/rasterizer/jitter/jit_pch.hpp  |  134 ++
 .../swr/rasterizer/jitter/streamout_jit.cpp|5 +-
 13 files changed, 819 insertions(+), 1044 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp

-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: fix invalid sign masks in avx512 simdlib code

2018-01-04 Thread Tim Rowley
Should be 0x80000000 instead of 0x8000000.

Cc: mesa-sta...@lists.freedesktop.org
---
 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl | 2 +-
 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl | 2 +-
 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
index 66e8309610..b70a7691e2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -270,7 +270,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float 
old, float const* p, In
 {
 __mmask16 m = 0xf;
 m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-_mm512_set1_epi32(0x800));
+_mm512_set1_epi32(0x8000));
 return __conv(_mm512_mask_i32gather_ps(
 __conv(old),
 m,
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
index 3f93cfbd7f..3fcfd250f9 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -271,7 +271,7 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float 
old, float const* p, In
 {
 __mmask16 m = 0xff;
 m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-_mm512_set1_epi32(0x800));
+_mm512_set1_epi32(0x8000));
 return __conv(_mm512_mask_i32gather_ps(
 __conv(old),
 m,
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index c13b9f616a..8de62f2a7e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -540,7 +540,7 @@ static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
 }
 static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
 {
-__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x800));
+__mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x8000));
 return static_cast(m);
 }
 
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: fix build break for llvm-6

2018-01-02 Thread Tim Rowley
LLVM API change.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104381
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 3f0772c942..59672bb545 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -498,7 +498,11 @@ std::unique_ptr 
JitCache::getObject(const llvm::Module* M)
 break;
 }
 
+#if LLVM_VERSION_MAJOR < 6
 pBuf = 
llvm::MemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize()));
+#else
+pBuf = 
llvm::WritableMemoryBuffer::getNewUninitMemBuffer(size_t(header.GetBufferSize()));
+#endif
 if (!fread(const_cast(pBuf->getBufferStart()), 
header.GetBufferSize(), 1, fpIn))
 {
 pBuf = nullptr;
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 19/20] swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle

2017-12-14 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 60 ++
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  3 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 30 +--
 3 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index bdcafd28a3..0774889af1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -653,16 +653,14 @@ namespace SwrJit
 }
 else
 {
-Value *src0 = EXTRACT2_F(vSrc, 0);
-Value *src1 = EXTRACT2_F(vSrc, 1);
+Value *src0 = EXTRACT2(vSrc, 0);
+Value *src1 = EXTRACT2(vSrc, 1);
 
-Value *indices0 = EXTRACT2_I(vIndices, 0);
-Value *indices1 = EXTRACT2_I(vIndices, 1);
+Value *indices0 = EXTRACT2(vIndices, 0);
+Value *indices1 = EXTRACT2(vIndices, 1);
 
-Value *vmask16 = VMASK2(vMask);
-
-Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
-Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+Value *mask0 = EXTRACT2(vMask, 0);
+Value *mask1 = EXTRACT2(vMask, 1);
 
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
@@ -738,16 +736,14 @@ namespace SwrJit
 }
 else
 {
-Value *src0 = EXTRACT2_F(vSrc, 0);
-Value *src1 = EXTRACT2_F(vSrc, 1);
-
-Value *indices0 = EXTRACT2_I(vIndices, 0);
-Value *indices1 = EXTRACT2_I(vIndices, 1);
+Value *src0 = EXTRACT2(vSrc, 0);
+Value *src1 = EXTRACT2(vSrc, 1);
 
-Value *vmask16 = VMASK2(vMask);
+Value *indices0 = EXTRACT2(vIndices, 0);
+Value *indices1 = EXTRACT2(vIndices, 1);
 
-Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
-Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+Value *mask0 = EXTRACT2(vMask, 0);
+Value *mask1 = EXTRACT2(vMask, 1);
 
 Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
@@ -809,34 +805,12 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
-//
-/// @brief
-Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
-{
-const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
-Value *result = VUNDEF_F();
-
-for (uint32_t i = 0; i < mVWidth; i += 1)
-{
-#if 1
-if (!a2->getType()->getScalarType()->isFloatTy())
-{
-a2 = BITCAST(a2, mSimd2FP32Ty);
-}
-
-#endif
-Value *temp = VEXTRACT(a2, C(i0 + i));
-
-result = VINSERT(result, temp, C(i));
-}
-
-return result;
-}
-
-Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
+Value *Builder::EXTRACT2(Value *x, uint32_t imm)
 {
-return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
+if (imm == 0)
+return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 
5, 6, 7});
+else
+return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 
12, 13, 14, 15});
 }
 
 Value *Builder::JOIN2(Value *a, Value *b)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 98bc563351..646ed0efb2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -117,8 +117,7 @@ Value *VMASK2(Value *mask);
 //
 
 #if USE_SIMD16_BUILDER
-Value *EXTRACT2_F(Value *a2, uint32_t imm);
-Value *EXTRACT2_I(Value *a2, uint32_t imm);
+Value *EXTRACT2(Value *x, uint32_t imm);
 Value *JOIN2(Value *a, Value *b);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 8d97ddfdc9..aa911b58f3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1078,14 +1078,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 vOffsets16 = ADD(vOffsets16, vInstanceStride16);
 
 // TODO: remove the following simd8 interop stuff once all code paths 
are fully widened to SIMD16..
-Value *vmask16 = VMASK2(vGatherMask16);
 
-Value *vGatherMask  = MASK(EXTRACT2_I(vmask16, 0));
-Value *vGatherMask2 = MASK(EXTRACT2_I(vmask16, 1));
-
-Value *vOffsets  = EXTRACT2_I(vOffsets16, 0);
-Value 

[Mesa-dev] [PATCH 17/20] swr/rast: Replace VPSRL with LSHR

2017-12-14 Thread Tim Rowley
Replace use of x86 intrinsic with general llvm IR instruction.

Generates the same final assembly.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  2 --
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 30 --
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  5 
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  8 +++---
 4 files changed, 4 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 8bbf36d9b8..9544353eb9 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -47,8 +47,6 @@ intrinsics = [
 ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
-['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
-['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 684c9fac54..bdcafd28a3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -808,36 +808,6 @@ namespace SwrJit
 return vGather;
 }
 
-#if USE_SIMD16_BUILDER
-Value *Builder::PSRLI(Value *a, Value *imm)
-{
-return VPSRLI(a, imm);
-}
-
-Value *Builder::PSRLI_16(Value *a, Value *imm)
-{
-Value *result = VUNDEF2_I();
-
-// use avx512 shift right instruction if available
-if (JM()->mArch.AVX512F())
-{
-result = VPSRLI_16(a, imm);
-}
-else
-{
-Value *a0 = EXTRACT2_I(a, 0);
-Value *a1 = EXTRACT2_I(a, 1);
-
-Value *result0 = PSRLI(a0, imm);
-Value *result1 = PSRLI(a1, imm);
-
-result = JOIN2(result0, result1);
-}
-
-return result;
-}
-
-#endif
 #if USE_SIMD16_BUILDER
 //
 /// @brief
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 6c883d8f52..98bc563351 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -143,11 +143,6 @@ void GATHER4DD(const SWR_FORMAT_INFO , Value* 
pSrcBase, Value* byteOffsets,
 
 Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t 
scale = 1);
 
-#if USE_SIMD16_BUILDER
-Value *PSRLI(Value *a, Value *imm);
-Value *PSRLI_16(Value *a, Value *imm);
-
-#endif
 void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
 
 void Shuffle8bpcGather4(const SWR_FORMAT_INFO , Value* vGatherInput, 
Value* vGatherOutput[], bool bPackedOutput);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 1312ac0009..8d97ddfdc9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1422,12 +1422,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // But, we know that elements must be aligned 
for FETCH. :)
 // Right shift the offset by a bit and then 
scale by 2 to remove the sign extension.
 #if USE_SIMD16_BUILDER
-Value *shiftedOffsets = VPSRLI_16(vOffsets16, 
C(1));
+Value *shiftedOffsets = LSHR(vOffsets16, 1);
 pVtxSrc2[currentVertexElement] = 
GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2);
 
 #else
-Value *vShiftedOffsets = VPSRLI(vOffsets, 
C(1));
-Value *vShiftedOffsets2 = VPSRLI(vOffsets2, 
C(1));
+Value *vShiftedOffsets = LSHR(vOffsets, 1);
+Value *vShiftedOffsets2 = LSHR(vOffsets2, 1);
 
 vVertexElements[currentVertexElement]  = 
GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
 vVertexElements2[currentVertexElement] = 
GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
@@ -1492,7 +1492,7 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // However, 

[Mesa-dev] [PATCH 20/20] swr/rast: Move more RTAI handling out of binner

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 13 +
 src/gallium/drivers/swr/rasterizer/core/clip.h |  1 +
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 7ef87c4443..9aa9f9e79b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -1023,18 +1023,7 @@ void BinPostSetupPointsImpl(
 SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax);
 
 // store render target array index
-OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
-if (state.backendState.readRenderTargetArrayIndex)
-{
-typename SIMD_T::Vec4 vRtai[2];
-pa.Assemble(VERTEX_SGV_SLOT, vRtai);
-typename SIMD_T::Integer vRtaii = 
SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
-SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii);
-}
-else
-{
-SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si());
-}
+const uint32_t *aRTAI = reinterpret_cast();
 
 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
 SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index e5e00d49b0..592c9bfa73 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -646,6 +646,7 @@ public:
 
 PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast([0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, 
NumVertsPerPrim, clipTopology);
 clipPA.viewportArrayActive = pa.viewportArrayActive;
+clipPA.rtArrayActive = pa.rtArrayActive;
 
 static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 
0x1f, 0x3f, 0x7f };
 
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 13/20] swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components

2017-12-14 Thread Tim Rowley
Also widen the 16-bit and 8-bit integer vertex component gathers to SIMD16.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |  1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 36 +
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  3 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 86 +-
 4 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index ac8b3badf6..8bbf36d9b8 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -46,6 +46,7 @@ intrinsics = [
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 3a486e4c1e..684c9fac54 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -723,6 +723,42 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+{
+Value *vGather = VUNDEF2_F();
+
+// use avx512 gather instruction if available
+if (JM()->mArch.AVX512F())
+{
+// force mask to <N-bit Integer>, required by vgather2
+Value *mask = BITCAST(vMask, mInt16Ty);
+
+vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+}
+else
+{
+Value *src0 = EXTRACT2_F(vSrc, 0);
+Value *src1 = EXTRACT2_F(vSrc, 1);
+
+Value *indices0 = EXTRACT2_I(vIndices, 0);
+Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+Value *vmask16 = VMASK2(vMask);
+
+Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
+Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+
+Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
+Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
+
+vGather = JOIN2(gather0, gather1);
+}
+
+return vGather;
+}
+
+#endif
 //
 /// @brief Generate a masked gather operation in LLVM IR.  If not
 /// supported on the underlying platform, emulate it with loads
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 231bd6ad85..6c883d8f52 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -135,6 +135,9 @@ void GATHER4PS(const SWR_FORMAT_INFO , Value* 
pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
 
 Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t 
scale = 1);
+#if USE_SIMD16_BUILDER
+Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, 
uint8_t scale = 1);
+#endif
 void GATHER4DD(const SWR_FORMAT_INFO , Value* pSrcBase, Value* 
byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index e0a0770560..ec3b5eafcc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1349,14 +1349,6 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 if (compMask)
 {
 #if USE_SIMD16_BUILDER
-#if USE_SIMD16_BUILDER
-#else
-Value *gatherResult[2];
-
-gatherResult[0] = JOIN2(vGatherResult[0], 
vGatherResult2[0]);
-gatherResult[1] = JOIN2(vGatherResult[1], 
vGatherResult2[1]);
-
-#endif
 Value *pVtxOut2 = BITCAST(pVtxOut, 
PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
 Shuffle16bpcArgs args = 
std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, 
CONVERT_NONE,
@@ -1701,6 +1693,9 @@ void FetchJit::JitGatherVertices(const 

[Mesa-dev] [PATCH 18/20] swr/rast: Fix cache of API thread event manager

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 25a3f34841..09b482dcc0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -166,7 +166,7 @@ HANDLE SwrCreateContext(
 
 #if defined(KNOB_ENABLE_AR)
 // cache the API thread event manager, for use with sim layer
-pCreateInfo->hArEventManager = pContext->pArContext[16];
+pCreateInfo->hArEventManager = 
pContext->pArContext[pContext->NumWorkerThreads + 1];
 #endif
 
 // State setup AFTER context is fully initialized
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 12/20] swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle

2017-12-14 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 38 ++---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  5 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 92 ++
 3 files changed, 30 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index b2210db717..3a486e4c1e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -667,8 +667,7 @@ namespace SwrJit
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
 
-vGather = INSERT2_F(vGather, gather0, 0);
-vGather = INSERT2_F(vGather, gather1, 1);
+vGather = JOIN2(gather0, gather1);
 }
 
 return vGather;
@@ -796,8 +795,7 @@ namespace SwrJit
 Value *result0 = PSRLI(a0, imm);
 Value *result1 = PSRLI(a1, imm);
 
-result = INSERT2_I(result, result0, 0);
-result = INSERT2_I(result, result1, 1);
+result = JOIN2(result0, result1);
 }
 
 return result;
@@ -835,37 +833,13 @@ namespace SwrJit
 return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
 }
 
-//
-/// @brief
-Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm)
+Value *Builder::JOIN2(Value *a, Value *b)
 {
-const uint32_t i0 = (imm > 0) ? mVWidth : 0;
-
-Value *result = BITCAST(a2, mSimd2FP32Ty);
-
-for (uint32_t i = 0; i < mVWidth; i += 1)
-{
-#if 1
-if (!b->getType()->getScalarType()->isFloatTy())
-{
-b = BITCAST(b, mSimdFP32Ty);
-}
-
-#endif
-Value *temp = VEXTRACT(b, C(i));
-
-result = VINSERT(result, temp, C(i0 + i));
-}
-
-return result;
+return VSHUFFLE(a, b,
+{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15});
 }
-
-Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm)
-{
-return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty);
-}
-
 #endif
+
 //
 /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
 Value *Builder::MASK(Value *vmask)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 62360a3ad7..231bd6ad85 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -119,10 +119,9 @@ Value *VMASK2(Value *mask);
 #if USE_SIMD16_BUILDER
 Value *EXTRACT2_F(Value *a2, uint32_t imm);
 Value *EXTRACT2_I(Value *a2, uint32_t imm);
-Value *INSERT2_F(Value *a2, Value *b, uint32_t imm);
-Value *INSERT2_I(Value *a2, Value *b, uint32_t imm);
-
+Value *JOIN2(Value *a, Value *b);
 #endif
+
 Value *MASKLOADD(Value* src, Value* mask);
 
 void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c960dc77fb..e0a0770560 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -960,10 +960,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 // offset indices by baseVertex
 #if USE_SIMD16_GATHERS
 #if USE_SIMD16_BUILDER
-Value *vIndices16 = VUNDEF2_I();
-
-vIndices16 = INSERT2_I(vIndices16, vIndices,  0);
-vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
+Value *vIndices16 = JOIN2(vIndices, vIndices2);
 
 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
 #else
@@ -982,10 +979,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 // offset indices by baseVertex
 #if USE_SIMD16_GATHERS
 #if USE_SIMD16_BUILDER
-Value *vIndices16 = VUNDEF2_I();
-
-vIndices16 = INSERT2_I(vIndices16, vIndices,  0);
-vIndices16 = INSERT2_I(vIndices16, vIndices2, 1);
+Value *vIndices16 = JOIN2(vIndices, vIndices2);
 
 vCurIndices16 = ADD(vIndices16, vBaseVertex16);
 #else
@@ -1206,9 +1200,7 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 {
 #if USE_SIMD16_BUILDER
 // pack adjacent pairs of SIMD8s into SIMD16s
-pVtxSrc2[currentVertexElement] = VUNDEF2_F();
-pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c],  0);
-pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
+

[Mesa-dev] [PATCH 15/20] swr/rast: Pull RTAI gather & offset out of clip/bin code

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 118 +++-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  30 ++--
 src/gallium/drivers/swr/rasterizer/core/clip.h |  35 +++--
 src/gallium/drivers/swr/rasterizer/core/context.h  |   4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 153 +++--
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   8 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   1 +
 7 files changed, 203 insertions(+), 146 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index a664ed812f..7ef87c4443 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -45,7 +45,8 @@ void BinPostSetupLinesImpl(
 typename SIMD_T::Float recipW[],
 uint32_t primMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const );
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const );
 
 template 
 void BinPostSetupPointsImpl(
@@ -55,7 +56,8 @@ void BinPostSetupPointsImpl(
 typename SIMD_T::Vec4 prim[],
 uint32_t primMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const );
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const );
 
 //
 /// @brief Processes attributes for the backend based on linkage mask and
@@ -308,9 +310,11 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Vec4 tri[3],
 uint32_t triMask,
 typename SIMD_T::Integer const ,
-typename SIMD_T::Integer const )
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const )
 {
 SWR_CONTEXT *pContext = pDC->pContext;
+const uint32_t *aRTAI = reinterpret_cast();
 
 AR_BEGIN(FEBinTriangles, pDC->drawId);
 
@@ -604,21 +608,21 @@ endBinTriangles:
 recipW[0] = vRecipW0;
 recipW[1] = vRecipW1;
 
-BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 line[0] = tri[1];
 line[1] = tri[2];
 recipW[0] = vRecipW1;
 recipW[1] = vRecipW2;
 
-BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 line[0] = tri[2];
 line[1] = tri[0];
 recipW[0] = vRecipW2;
 recipW[1] = vRecipW0;
 
-BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx);
+BinPostSetupLinesImpl(pDC, pa, workerId, line, 
recipW, triMask, primID, viewportIdx, rtIdx);
 
 AR_END(FEBinTriangles, 1);
 return;
@@ -626,9 +630,9 @@ endBinTriangles:
 else if (rastState.fillMode == SWR_FILLMODE_POINT)
 {
 // Bin 3 points
-BinPostSetupPointsImpl(pDC, pa, workerId, [0], 
triMask, primID, viewportIdx);
-BinPostSetupPointsImpl(pDC, pa, workerId, [1], 
triMask, primID, viewportIdx);
-BinPostSetupPointsImpl(pDC, pa, workerId, [2], 
triMask, primID, viewportIdx);
+BinPostSetupPointsImpl(pDC, pa, workerId, [0], 
triMask, primID, viewportIdx, rtIdx);
+BinPostSetupPointsImpl(pDC, pa, workerId, [1], 
triMask, primID, viewportIdx, rtIdx);
+BinPostSetupPointsImpl(pDC, pa, workerId, [2], 
triMask, primID, viewportIdx, rtIdx);
 
 AR_END(FEBinTriangles, 1);
 return;
@@ -659,22 +663,6 @@ endBinTriangles:
 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 
-// store render target array index
-OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
-if (state.backendState.readRenderTargetArrayIndex)
-{
-typename SIMD_T::Vec4 vRtai[3];
-pa.Assemble(VERTEX_SGV_SLOT, vRtai);
-typename SIMD_T::Integer vRtaii;
-vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
-SIMD_T::store_si(reinterpret_cast(aRTAI), 
vRtaii);
-}
-else
-{
-SIMD_T::store_si(reinterpret_cast(aRTAI), 
SIMD_T::setzero_si());
-}
-
-
 // scan remaining valid triangles and bin each separately
 while (_BitScanForward(, triMask))
 {
@@ -763,9 +751,10 @@ void BinTriangles(
 simdvector tri[3],
 uint32_t triMask,
 simdscalari const ,
-simdscalari const )
+simdscalari const ,
+simdscalari const )
 {
-BinTrianglesImpl(pDC, pa, workerId, tri, 
triMask, primID, viewportIdx);
+

[Mesa-dev] [PATCH 16/20] swr/rast: Rework thread binding parameters for machine partitioning

2017-12-14 Thread Tim Rowley
Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to
SwrCreateContext.

Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to
control reservation of API threads.

Add SwrBindApiThread() function to allow binding of API threads to
reserved HW threads.
---
 .../drivers/swr/rasterizer/codegen/knob_defs.py|  29 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  40 ++-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  33 +++
 src/gallium/drivers/swr/rasterizer/core/context.h  |   1 +
 .../drivers/swr/rasterizer/core/threads.cpp| 299 +++--
 src/gallium/drivers/swr/rasterizer/core/threads.h  |   4 +
 .../drivers/swr/rasterizer/core/tilemgr.cpp|   4 +-
 7 files changed, 322 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py 
b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
index 09e3124602..30803927e3 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@@ -62,15 +62,33 @@ KNOBS = [
 'category'  : 'perf',
 }],
 
-['MAX_NUMA_NODES', {
+['BASE_NUMA_NODE', {
 'type'  : 'uint32_t',
 'default'   : '0',
+'desc'  : ['Starting NUMA node index to use when allocating 
compute resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of NUMA nodes used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
+['MAX_NUMA_NODES', {
+'type'  : 'uint32_t',
+'default'   : '1' if sys.platform == 'win32' else '0',
 'desc'  : ['Maximum # of NUMA-nodes per system used for worker 
threads',
'  0 == ALL NUMA-nodes in the system',
'  N == Use at most N NUMA-nodes for rendering'],
 'category'  : 'perf',
 }],
 
+['BASE_CORE', {
+'type'  : 'uint32_t',
+'default'   : '0',
+'desc'  : ['Starting core index to use when allocating compute 
resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of cores used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
 ['MAX_CORES_PER_NUMA_NODE', {
 'type'  : 'uint32_t',
 'default'   : '0',
@@ -80,6 +98,15 @@ KNOBS = [
 'category'  : 'perf',
 }],
 
+['BASE_THREAD', {
+'type'  : 'uint32_t',
+'default'   : '0',
+'desc'  : ['Starting thread index to use when allocating compute 
resources.',
+   'Setting this to a non-zero value will reduce the 
maximum # of threads used.'],
+'category'  : 'perf',
+'advanced'  : True,
+}],
+
 ['MAX_THREADS_PER_CORE', {
 'type'  : 'uint32_t',
 'default'   : '1',
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 9265440904..25a3f34841 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
 pContext->dsRing[dc].pArena = new 
CachingArena(pContext->cachingArenaAllocator);
 }
 
-pContext->threadInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS;
-pContext->threadInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES;
-pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = 
KNOB_MAX_CORES_PER_NUMA_NODE;
-pContext->threadInfo.MAX_THREADS_PER_CORE  = KNOB_MAX_THREADS_PER_CORE;
-pContext->threadInfo.SINGLE_THREADED   = KNOB_SINGLE_THREADED;
-
 if (pCreateInfo->pThreadInfo)
 {
 pContext->threadInfo = *pCreateInfo->pThreadInfo;
 }
+else
+{
+pContext->threadInfo.MAX_WORKER_THREADS = 
KNOB_MAX_WORKER_THREADS;
+pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
+pContext->threadInfo.BASE_CORE  = KNOB_BASE_CORE;
+pContext->threadInfo.BASE_THREAD= KNOB_BASE_THREAD;
+pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
+pContext->threadInfo.MAX_CORES_PER_NUMA_NODE= 
KNOB_MAX_CORES_PER_NUMA_NODE;
+pContext->threadInfo.MAX_THREADS_PER_CORE   = 
KNOB_MAX_THREADS_PER_CORE;
+pContext->threadInfo.SINGLE_THREADED= KNOB_SINGLE_THREADED;
+}
+
+if (pCreateInfo->pApiThreadInfo)
+{
+pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+}
+else
+{
+pContext->apiThreadInfo.bindAPIThread0  = true;
+pContext->apiThreadInfo.numAPIReservedThreads   = 1;
+pContext->apiThreadInfo.numAPIThreadsPerCore= 1;
+}
 
 memset(>WaitLock, 0, sizeof(pContext->WaitLock));
 memset(>FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -113,6 +129,11 @@ HANDLE SwrCreateContext(
 
 

[Mesa-dev] [PATCH 14/20] swr/rast: Remove no-op VBROADCAST of vID

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index ec3b5eafcc..1312ac0009 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -3101,7 +3101,7 @@ Value* FetchJit::GenerateCompCtrlVector(const 
ComponentControl ctrl)
 #else
 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
 #endif
-return VBROADCAST(pId);
+return pId;
 }
 case StoreInstanceId:
 {
@@ -3129,7 +3129,7 @@ Value* FetchJit::GenerateCompCtrlVector2(const 
ComponentControl ctrl)
 
 Value *pId = JOIN2(pId_lo, pId_hi);
 
-return VBROADCAST2(pId);
+return pId;
 }
 case StoreInstanceId:
 {
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/20] swr/rast: Remove unneeded copy of gather mask

2017-12-14 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 22 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 80 ++
 2 files changed, 23 insertions(+), 79 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 8ffe05b41c..0221106664 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -1107,23 +1107,19 @@ namespace SwrJit
 }
 
 void Builder::GATHER4PS(const SWR_FORMAT_INFO , Value* pSrcBase, 
Value* byteOffsets, 
-Value* mask, Value* vGatherComponents[], bool 
bPackedOutput)
+Value* vMask, Value* vGatherComponents[], bool 
bPackedOutput)
 {
 switch(info.bpp / info.numComps)
 {
 case 16: 
 {
 Value* vGatherResult[2];
-Value *vMask;
 
 // TODO: vGatherMaskedVal
 Value* vGatherMaskedVal = VIMMED1((float)0);
 
 // always have at least one component out of x or y to 
fetch
 
-// save mask as it is zero'd out after each gather
-vMask = mask;
-
 vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of first 8x32bit integer gather for 16bit 
components
 // 256i - 01234567
@@ -1135,7 +1131,6 @@ namespace SwrJit
 {
 // offset base to the next components(zw) in the 
vertex to gather
 pSrcBase = GEP(pSrcBase, C((char)4));
-vMask = mask;
 
 vGatherResult[1] =  GATHERPS(vGatherMaskedVal, 
pSrcBase, byteOffsets, vMask);
 // e.g. result of second 8x32bit integer gather for 
16bit components
@@ -1164,9 +1159,6 @@ namespace SwrJit
 {
 uint32_t swizzleIndex = info.swizzle[i];
 
-// save mask as it is zero'd out after each gather
-Value *vMask = mask;
-
 // Gather a SIMD of components
 vGatherComponents[swizzleIndex] = 
GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
 
@@ -1182,14 +1174,14 @@ namespace SwrJit
 }
 
 void Builder::GATHER4DD(const SWR_FORMAT_INFO , Value* pSrcBase, 
Value* byteOffsets,
-Value* mask, Value* vGatherComponents[], bool 
bPackedOutput)
+Value* vMask, Value* vGatherComponents[], bool 
bPackedOutput)
 {
 switch (info.bpp / info.numComps)
 {
 case 8:
 {
 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, mask);
+Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of an 8x32bit integer gather for 8bit components
 // 256i - 01234567
 //xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
@@ -1200,16 +1192,12 @@ namespace SwrJit
 case 16:
 {
 Value* vGatherResult[2];
-Value *vMask;
 
 // TODO: vGatherMaskedVal
 Value* vGatherMaskedVal = VIMMED1((int32_t)0);
 
 // always have at least one component out of x or y to fetch
 
-// save mask as it is zero'd out after each gather
-vMask = mask;
-
 vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of first 8x32bit integer gather for 16bit 
components
 // 256i - 01234567
@@ -1221,7 +1209,6 @@ namespace SwrJit
 {
 // offset base to the next components(zw) in the vertex to 
gather
 pSrcBase = GEP(pSrcBase, C((char)4));
-vMask = mask;
 
 vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of second 8x32bit integer gather for 16bit 
components
@@ -1251,9 +1238,6 @@ namespace SwrJit
 {
 uint32_t swizzleIndex = info.swizzle[i];
 
-// save mask as it is zero'd out after each gather
-Value *vMask = mask;
-
 // Gather a SIMD of components
 vGatherComponents[swizzleIndex] = 
GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 

[Mesa-dev] [PATCH 11/20] swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components

2017-12-14 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 55 +++---
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 2065db3475..c960dc77fb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1277,6 +1277,43 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 case 16:
 {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+Value *gatherResult[2];
+
+// if we have at least one component out of x or y to fetch
+if (isComponentEnabled(compMask, 0) || 
isComponentEnabled(compMask, 1))
+{
+gatherResult[0] = GATHERPS_16(gatherSrc16, 
pStreamBase, vOffsets16, vGatherMask16);
+
+// e.g. result of first 8x32bit integer gather for 
16bit components
+// 256i - 01234567
+//xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+//
+}
+else
+{
+gatherResult[0] = VUNDEF2_I();
+}
+
+// if we have at least one component out of z or w to fetch
+if (isComponentEnabled(compMask, 2) || 
isComponentEnabled(compMask, 3))
+{
+// offset base to the next components(zw) in the 
vertex to gather
+pStreamBase = GEP(pStreamBase, C((char)4));
+
+gatherResult[1] = GATHERPS_16(gatherSrc16, 
pStreamBase, vOffsets16, vGatherMask16);
+
+// e.g. result of second 8x32bit integer gather for 
16bit components
+// 256i - 01234567
+//zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+//
+}
+else
+{
+gatherResult[1] = VUNDEF2_I();
+}
+
+#else
 Value *vGatherResult[2];
 Value *vGatherResult2[2];
 
@@ -1315,10 +1352,13 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 vGatherResult2[1] = VUNDEF_I();
 }
 
+#endif
 // if we have at least one component to shuffle into place
 if (compMask)
 {
 #if USE_SIMD16_BUILDER
+#if USE_SIMD16_BUILDER
+#else
 Value *gatherResult[2];
 
 gatherResult[0] = VUNDEF2_I();
@@ -1330,6 +1370,7 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 gatherResult[1] = INSERT2_I(gatherResult[1], 
vGatherResult[1],  0);
 gatherResult[1] = INSERT2_I(gatherResult[1], 
vGatherResult2[1], 1);
 
+#endif
 Value *pVtxOut2 = BITCAST(pVtxOut, 
PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
 Shuffle16bpcArgs args = 
std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, 
CONVERT_NONE,
@@ -1511,21 +1552,21 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 // if we need to gather the component
 if (compCtrl[i] == StoreSrc)
 {
-Value *vMaskLo  = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
+Value *vMaskLo  = VSHUFFLE(vGatherMask,  
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
 Value *vMaskLo2 = VSHUFFLE(vGatherMask2, 
VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 }));
-Value *vMaskHi  = VSHUFFLE(vGatherMask, 
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
+Value *vMaskHi  = VSHUFFLE(vGatherMask,  
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
 Value *vMaskHi2 = VSHUFFLE(vGatherMask2, 
VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 }));
 
-Value *vOffsetsLo  = VEXTRACTI128(vOffsets, 
C(0));
+Value *vOffsetsLo  = VEXTRACTI128(vOffsets,  
C(0));
 Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, 
C(0));
-Value *vOffsetsHi  = VEXTRACTI128(vOffsets, 
C(1));
+Value *vOffsetsHi  = VEXTRACTI128(vOffsets,  
C(1));
 Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, 
C(1));
 
 Value *vZeroDouble = VECTOR_SPLAT(4, 
ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
 
-

[Mesa-dev] [PATCH 03/20] swr/rast: Corrections to multi-scissor handling

2017-12-14 Thread Tim Rowley
binner's GatherScissors() will be turned into a real gather in the not
too distant future.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 176 ++---
 1 file changed, 88 insertions(+), 88 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 52375f8956..8a5356b168 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -226,117 +226,117 @@ static void GatherScissors(const SWR_RECT 
*pScissorsInFixedPoint, const uint32_t
 simdscalari , simdscalari , simdscalari , 
simdscalari )
 {
 scisXmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+pScissorsInFixedPoint[pViewportIndex[7]].xmin,
 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[7]].xmin);
+pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+pScissorsInFixedPoint[pViewportIndex[0]].xmin);
 scisYmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].ymin,
-pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+pScissorsInFixedPoint[pViewportIndex[7]].ymin,
 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-pScissorsInFixedPoint[pViewportIndex[7]].ymin);
+pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+pScissorsInFixedPoint[pViewportIndex[0]].ymin);
 scisXmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmax,
-pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+pScissorsInFixedPoint[pViewportIndex[7]].xmax,
 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-pScissorsInFixedPoint[pViewportIndex[7]].xmax);
+pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+pScissorsInFixedPoint[pViewportIndex[0]].xmax);
 scisYmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].ymax,
-pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+pScissorsInFixedPoint[pViewportIndex[7]].ymax,
 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-pScissorsInFixedPoint[pViewportIndex[7]].ymax);
+pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+pScissorsInFixedPoint[pViewportIndex[0]].ymax);
 }
 
 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
 simd16scalari , simd16scalari , simd16scalari , 
simd16scalari )
 {
 scisXmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[0]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-pScissorsInFixedPoint[pViewportIndex[8]].xmin,
-pScissorsInFixedPoint[pViewportIndex[9]].xmin,
-pScissorsInFixedPoint[pViewportIndex[10]].xmin,

[Mesa-dev] [PATCH 08/20] swr/rast: Pull most of the VPAI manipulation out of the binner/clipper

2017-12-14 Thread Tim Rowley
Move out of binner/clipper; hand them down from the frontend code instead.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 124 ++---
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  25 ++---
 src/gallium/drivers/swr/rasterizer/core/clip.h |  58 +++---
 src/gallium/drivers/swr/rasterizer/core/context.h  |   4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 112 ++-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   8 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   4 +-
 7 files changed, 177 insertions(+), 158 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 22996c5a5d..a664ed812f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -307,7 +307,8 @@ void SIMDCALL BinTrianglesImpl(
 uint32_t workerId,
 typename SIMD_T::Vec4 tri[3],
 uint32_t triMask,
-typename SIMD_T::Integer const )
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const )
 {
 SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -323,31 +324,6 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
-typename SIMD_T::Vec4 vpiAttrib[3];
-typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
-
-if (state.backendState.readViewportArrayIndex)
-{
-pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
-vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-}
-
-
-if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
-{
-// OOB indices => forced to zero.
-vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
-typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
-viewportIdx = SIMD_T::and_si(vClearMask, vpai);
-}
-else
-{
-viewportIdx = vpai;
-}
-
 if (feState.vpTransformDisable)
 {
 // RHW is passed in directly when VP transform is disabled
@@ -375,7 +351,7 @@ void SIMDCALL BinTrianglesImpl(
 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 
 // Viewport transform to screen space coords
-if (state.backendState.readViewportArrayIndex)
+if (pa.viewportArrayActive)
 {
 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 }
@@ -568,8 +544,8 @@ void SIMDCALL BinTrianglesImpl(
 /// @todo:  Look at speeding this up -- weigh against corresponding costs 
in rasterizer.
 {
 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 
-if (state.backendState.readViewportArrayIndex)
+if (pa.viewportArrayActive)
 {
 GatherScissors([0], pViewportIndex, 
scisXmin, scisYmin, scisXmax, scisYmax);
 }
@@ -786,9 +762,10 @@ void BinTriangles(
 uint32_t workerId,
 simdvector tri[3],
 uint32_t triMask,
-simdscalari const &primID)
+simdscalari const &primID,
+simdscalari const &viewportIdx)
 {
-BinTrianglesImpl(pDC, pa, workerId, tri, 
triMask, primID);
+BinTrianglesImpl(pDC, pa, workerId, tri, 
triMask, primID, viewportIdx);
 }
 
 #if USE_SIMD16_FRONTEND
@@ -799,9 +776,10 @@ void SIMDCALL BinTriangles_simd16(
 uint32_t workerId,
 simd16vector tri[3],
 uint32_t triMask,
-simd16scalari const &primID)
+simd16scalari const &primID,
+simd16scalari const &viewportIdx)
 {
-BinTrianglesImpl(pDC, pa, workerId, tri, 
triMask, primID);
+BinTrianglesImpl(pDC, pa, workerId, tri, 
triMask, primID, viewportIdx);
 }
 
 #endif
@@ -1026,7 +1004,7 @@ void BinPostSetupPointsImpl(
 {
 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 
-if (state.backendState.readViewportArrayIndex)
+if (pa.viewportArrayActive)
 {
 GatherScissors([0], pViewportIndex, 
scisXmin, scisYmin, scisXmax, scisYmax);
 }
@@ -1176,38 +1154,13 @@ void BinPointsImpl(
 uint32_t workerId,
 typename SIMD_T::Vec4 prim[3],
 uint32_t primMask,
-typename SIMD_T::Integer const )
+typename SIMD_T::Integer const ,
+typename SIMD_T::Integer const )
 {
 const API_STATE& state = GetApiState(pDC);
 const SWR_FRONTEND_STATE& feState = state.frontendState;
 const SWR_RASTSTATE& rastState = state.rastState;
 
-// Read back viewport index if required
-typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
-typename SIMD_T::Vec4 vpiAttrib[1];
-typename SIMD_T::Integer vpai = 

[Mesa-dev] [PATCH 02/20] swr/rast: Binner fixes for viewport index offset handling

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 9 -
 src/gallium/drivers/swr/rasterizer/core/clip.h | 5 -
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 9d1f0d8799..52375f8956 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -470,6 +470,10 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 if (feState.vpTransformDisable)
 {
@@ -1326,6 +1330,10 @@ void BinPointsImpl(
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 if (!feState.vpTransformDisable)
 {
@@ -1647,7 +1655,6 @@ void SIMDCALL BinLinesImpl(
 if (state.backendState.readViewportArrayIndex)
 {
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 0d3d78057f..9d8bbc19e6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -694,7 +694,6 @@ public:
 if (state.backendState.readViewportArrayIndex)
 {
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 }
 
@@ -707,6 +706,10 @@ public:
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 }
+else
+{
+viewportIdx = vpai;
+}
 
 ComputeClipCodes(prim, viewportIdx);
 
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 00/20] swr: update rasterizer

2017-12-14 Thread Tim Rowley
Highlights include simd16 work, thread pool initialization rework,
and code cleanup.

Tim Rowley (20):
  swr/rast: Remove unneeded copy of gather mask
  swr/rast: Binner fixes for viewport index offset handling
  swr/rast: Corrections to multi-scissor handling
  swr/rast: WIP - Widen fetch shader to SIMD16
  swr/rast: Convert gather masks to Nx1bit
  swr/rast: Rewrite Shuffle8bpcGatherd using shuffle
  swr/rast: Move GatherScissors to header
  swr/rast: Pull most of the VPAI manipulation out of the binner/clipper
  swr/rast: Pass prim to ClipSimd
  swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components
  swr/rast: SIMD16 Fetch - Fully widen 16-bit float vertex components
  swr/rast: Replace INSERT2 vextract/vinsert with JOIN2 vshuffle
  swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components
  swr/rast: Remove no-op VBROADCAST of vID
  swr/rast: Pull of RTAI gather & offset out of clip/bin code
  swr/rast: Rework thread binding parameters for machine partitioning
  swr/rast: Replace VPSRL with LSHR
  swr/rast: Fix cache of API thread event manager
  swr/rast: EXTRACT2 changed from vextract/vinsert to vshuffle
  swr/rast: Move more RTAI handling out of binner

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |4 +-
 .../drivers/swr/rasterizer/codegen/knob_defs.py|   29 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|   42 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |   33 +
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  345 ++-
 src/gallium/drivers/swr/rasterizer/core/binner.h   |  127 +++
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |   31 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h |   67 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |5 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  179 +++-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |8 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |5 +-
 .../drivers/swr/rasterizer/core/threads.cpp|  299 --
 src/gallium/drivers/swr/rasterizer/core/threads.h  |4 +
 .../drivers/swr/rasterizer/core/tilemgr.cpp|4 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |  157 ++-
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |   13 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 1038 
 18 files changed, 1657 insertions(+), 733 deletions(-)

-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/20] swr/rast: WIP - Widen fetch shader to SIMD16

2017-12-14 Thread Tim Rowley
Widen vertex gather/storage to SIMD16 for all component types.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 716 -
 1 file changed, 689 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 337bb7f660..6c0e658e68 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -70,6 +70,9 @@ struct FetchJit : public Builder
 #else
 void Shuffle8bpcGatherd(Shuffle8bpcArgs );
 #endif
+#if USE_SIMD16_BUILDER
+void Shuffle8bpcGatherd2(Shuffle8bpcArgs );
+#endif
 
 typedef std::tuple Shuffle16bpcArgs;
@@ -78,6 +81,9 @@ struct FetchJit : public Builder
 #else
 void Shuffle16bpcGather(Shuffle16bpcArgs );
 #endif
+#if USE_SIMD16_BUILDER
+void Shuffle16bpcGather2(Shuffle16bpcArgs );
+#endif
 
 void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const 
uint32_t numEltsToStore, Value* ()[4]);
 #if USE_SIMD16_BUILDER
@@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, 
Value* pMask, Value* pB
 // only works if pixel size is <= 32bits
 SWR_ASSERT(info.bpp <= 32);
 
-   Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
 
 for (uint32_t comp = 0; comp < 4; ++comp)
 {
@@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value* vVertexElements[4];
 #if USE_SIMD16_GATHERS
 Value* vVertexElements2[4];
+#if USE_SIMD16_BUILDER
+Value *pVtxSrc2[4];
+#endif
 #endif
 
 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
@@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 #if USE_SIMD16_GATHERS
 // override cur indices with 0 if pitch is 0
 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
 vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
 
 // are vertices partially OOB?
@@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 // only fetch lanes that pass both tests
 vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
-vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
+vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
 }
 else
 {
@@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 {
 if (isComponentEnabled(compMask, c))
 {
-vVertexElements[currentVertexElement] = pResults[c];
+#if USE_SIMD16_BUILDER
+// pack adjacent pairs of SIMD8s into SIMD16s
+pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c],  0);
+pVtxSrc2[currentVertexElement] = 
INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
+
+#else
+vVertexElements[currentVertexElement]  = pResults[c];
 vVertexElements2[currentVertexElement] = pResults2[c];
-currentVertexElement++;
+
+#endif
+currentVertexElement += 1;
 
 if (currentVertexElement > 3)
 {
+#if USE_SIMD16_BUILDER
+// store SIMD16s
+Value *pVtxOut2 = BITCAST(pVtxOut, 
PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
 StoreVertexElements(pVtxOut, outputElt, 4, 
vVertexElements);
 StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, 
vVertexElements2);
 
+#endif
 outputElt += 1;
 
 // reset to the next vVertexElement to output
@@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 else if(info.type[0] == SWR_TYPE_FLOAT)
 {
 ///@todo: support 64 bit vb accesses
-Value* gatherSrc = VIMMED1(0.0f);
+Value *gatherSrc = VIMMED1(0.0f);
 #if USE_SIMD16_GATHERS
-Value* gatherSrc2 = VIMMED1(0.0f);
+Value *gatherSrc2 = VIMMED1(0.0f);
+#if USE_SIMD16_BUILDER
+Value *gatherSrc16 = VIMMED2_1(0.0f);
+#endif
 #endif
 
 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
@@ -1127,8 +1157,8 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 case 16:
 {
 #if 

[Mesa-dev] [PATCH 05/20] swr/rast: Convert gather masks to Nx1bit

2017-12-14 Thread Tim Rowley
Simplifies calling code, gets gather function interface closer to llvm's
masked_gather.
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 20 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 34 +-
 2 files changed, 14 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 0221106664..04092541e5 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -602,7 +602,7 @@ namespace SwrJit
 if(JM()->mArch.AVX2())
 {
 // force mask to <N x float>, required by vgather
-Value *mask = BITCAST(vMask, mSimdFP32Ty);
+Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
 
 vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
 }
@@ -617,7 +617,6 @@ namespace SwrJit
 vGather = VUNDEF_F();
 Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices,vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
 {
 // single component byte index
@@ -627,7 +626,7 @@ namespace SwrJit
 loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
-Value *selMask = VEXTRACT(mask,C(i));
+Value *selMask = VEXTRACT(vMask,C(i));
 // switch in a safe address to load if we're trying to access 
a vertex 
 Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
 Value *val = LOAD(validAddress);
@@ -648,7 +647,7 @@ namespace SwrJit
 if (JM()->mArch.AVX512F())
 {
 // force mask to , required by vgather2
-Value *mask = BITCAST(MASK2(vMask), mInt16Ty);
+Value *mask = BITCAST(vMask, mInt16Ty);
 
 vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
 }
@@ -689,7 +688,7 @@ namespace SwrJit
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
+vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
 }
 else
 {
@@ -702,7 +701,6 @@ namespace SwrJit
 vGather = VUNDEF_I();
 Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices, vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
 {
 // single component byte index
@@ -712,7 +710,7 @@ namespace SwrJit
 loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 
0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
-Value *selMask = VEXTRACT(mask, C(i));
+Value *selMask = VEXTRACT(vMask, C(i));
 // switch in a safe address to load if we're trying to access 
a vertex 
 Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
 Value *val = LOAD(validAddress, C(0));
@@ -739,6 +737,7 @@ namespace SwrJit
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
+vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, 
mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
 vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
@@ -752,7 +751,6 @@ namespace SwrJit
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
 Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
 Value *vOffsets = MUL(vIndices,vScaleVec);
-Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth/2; ++i)
 {
 // single component byte index
@@ -762,7 +760,7 @@ namespace SwrJit
 loadAddress = 
BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
 // pointer to the value to load if we're masking off a 
component
 Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
-Value *selMask = VEXTRACT(mask,C(i));
+Value *selMask = VEXTRACT(vMask,C(i));
 // switch in a safe address to load if we're trying to access 
a vertex
 Value *validAddress = SELECT(selMask, loadAddress, 
maskLoadAddress);
 Value *val = LOAD(validAddress);
@@ -1094,14 +1092,10 @@ namespace SwrJit
 const SWR_FORMAT_INFO  = GetFormatInfo(format);
 if(info.type[0] == 

[Mesa-dev] [PATCH 06/20] swr/rast: Rewrite Shuffle8bpcGatherd using shuffle

2017-12-14 Thread Tim Rowley
Ease future code maintenance, prepare for folding simd8 and simd16 versions.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 244 ++---
 1 file changed, 62 insertions(+), 182 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 67a4a04072..a847cb74da 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -2014,206 +2014,86 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs 
)
 const uint32_t ()[4] = std::get<9>(args);
 
 // cast types
-Type* vGatherTy = mSimdInt32Ty;
 Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is 
units of 32 bits
 
-// have to do extra work for sign extending
-if ((extendType == Instruction::CastOps::SExt) || (extendType == 
Instruction::CastOps::SIToFP)){
-Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints 
in a 128bit lane
-Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 
128), mVWidth / 4); // vwidth is units of 32 bits
-
-// shuffle mask, including any swizzling
-const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
-const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
-Value* vConstMask = C({char(x), char(x+4), char(x+8), char(x+12),
-char(y), char(y+4), char(y+8), char(y+12),
-char(z), char(z+4), char(z+8), char(z+12),
-char(w), char(w+4), char(w+8), char(w+12),
-char(x), char(x+4), char(x+8), char(x+12),
-char(y), char(y+4), char(y+8), char(y+12),
-char(z), char(z+4), char(z+8), char(z+12),
-char(w), char(w+4), char(w+8), char(w+12)});
-
-Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), 
vConstMask), vGatherTy);
-// after pshufb: group components together in each 128bit lane
-// 256i - 01234567
-//       
-
-Value* vi128XY = nullptr;
-if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
-vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 
0, 0})), v128Ty);
-// after PERMD: move and pack xy and zw components in low 64 bits 
of each 128bit lane
-// 256i - 01234567
-//  dcdc dcdc   dcdc dcdc (dc - don't care)
-}
-
-// do the same for zw components
-Value* vi128ZW = nullptr;
-if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
-vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 
0, 0})), v128Ty);
-}
-
-// init denormalize variables if needed
-Instruction::CastOps fpCast;
-Value* conversionFactor;
-
-switch (conversionType)
-{
-case CONVERT_NORMALIZED:
-fpCast = Instruction::CastOps::SIToFP;
-conversionFactor = VIMMED1((float)(1.0 / 127.0));
-break;
-case CONVERT_SSCALED:
-fpCast = Instruction::CastOps::SIToFP;
-conversionFactor = VIMMED1((float)(1.0));
-break;
-case CONVERT_USCALED:
-SWR_INVALID("Type should not be sign extended!");
-conversionFactor = nullptr;
-break;
-default:
-SWR_ASSERT(conversionType == CONVERT_NONE);
-conversionFactor = nullptr;
-break;
-}
+for (uint32_t i = 0; i < 4; i++)
+{
+if (!isComponentEnabled(compMask, i))
+continue;
 
-// sign extend all enabled components. If we have a fill 
vVertexElements, output to current simdvertex
-for (uint32_t i = 0; i < 4; i++)
+if (compCtrl[i] == ComponentControl::StoreSrc)
 {
-if (isComponentEnabled(compMask, i))
-{
-if (compCtrl[i] == ComponentControl::StoreSrc)
-{
-// if x or z, extract 128bits from lane 0, else for y or 
w, extract from lane 1
-uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
-// if x or y, use vi128XY permute result, else use vi128ZW
-Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
-
-// sign extend
-vVertexElements[currentVertexElement] = 
PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
-
-// denormalize if needed
-if (conversionType != CONVERT_NONE)
-{
-vVertexElements[currentVertexElement] = 
FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), 
conversionFactor);
-}
-  

[Mesa-dev] [PATCH 10/20] swr/rast: SIMD16 Fetch - Fully widen 32-bit float vertex components

2017-12-14 Thread Tim Rowley
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |   3 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp |  41 -
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |   7 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 175 ++---
 4 files changed, 194 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 44fc857371..ac8b3badf6 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -44,9 +44,10 @@ inst_aliases = {
 intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
-['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
+['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
+['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 04092541e5..b2210db717 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -639,7 +639,7 @@ namespace SwrJit
 }
 
 #if USE_SIMD16_BUILDER
-Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
 {
 Value *vGather = VUNDEF2_F();
 
@@ -649,7 +649,7 @@ namespace SwrJit
 // force mask to , required by vgather2
 Value *mask = BITCAST(vMask, mInt16Ty);
 
-vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
 }
 else
 {
@@ -659,8 +659,10 @@ namespace SwrJit
 Value *indices0 = EXTRACT2_I(vIndices, 0);
 Value *indices1 = EXTRACT2_I(vIndices, 1);
 
-Value *mask0 = EXTRACT2_I(vMask, 0);
-Value *mask1 = EXTRACT2_I(vMask, 1);
+Value *vmask16 = VMASK2(vMask);
+
+Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this 
better..
+Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
 
 Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
 Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
@@ -771,6 +773,37 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::PSRLI(Value *a, Value *imm)
+{
+return VPSRLI(a, imm);
+}
+
+Value *Builder::PSRLI_16(Value *a, Value *imm)
+{
+Value *result = VUNDEF2_I();
+
+// use avx512 shift right instruction if available
+if (JM()->mArch.AVX512F())
+{
+result = VPSRLI_16(a, imm);
+}
+else
+{
+Value *a0 = EXTRACT2_I(a, 0);
+Value *a1 = EXTRACT2_I(a, 1);
+
+Value *result0 = PSRLI(a0, imm);
+Value *result1 = PSRLI(a1, imm);
+
+result = INSERT2_I(result, result0, 0);
+result = INSERT2_I(result, result1, 1);
+}
+
+return result;
+}
+
+#endif
 #if USE_SIMD16_BUILDER
 //
 /// @brief
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index d858a827db..62360a3ad7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -130,7 +130,7 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, 
Value* byteOffsets,
 
 Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t 
scale = 1);
 #if USE_SIMD16_BUILDER
-Value *GATHERPS2(Value *src, Value *pBase, Value *indices, Value *mask, 
uint8_t scale = 1);
+Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, 
uint8_t scale = 1);
 #endif
 void GATHER4PS(const SWR_FORMAT_INFO , Value* pSrcBase, Value* 
byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput);
@@ -141,6 +141,11 @@ void GATHER4DD(const SWR_FORMAT_INFO , Value* 
pSrcBase, Value* byteOffsets,
 
 Value *GATHERPD(Value* src, Value* 

[Mesa-dev] [PATCH 07/20] swr/rast: Move GatherScissors to header

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 127 -
 src/gallium/drivers/swr/rasterizer/core/binner.h   | 127 +
 2 files changed, 127 insertions(+), 127 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 8a5356b168..22996c5a5d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -212,133 +212,6 @@ INLINE void ProcessAttributes(
 }
 }
 
-//
-/// @brief  Gather scissor rect data based on per-prim viewport indices.
-/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
-/// @param pViewportIndex - array of per-primitive vewport indexes.
-/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
-/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
-/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
-/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
-//
-/// @todo:  Look at speeding this up -- weigh against corresponding costs in 
rasterizer.
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
-simdscalari , simdscalari , simdscalari , 
simdscalari )
-{
-scisXmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[0]].xmin);
-scisYmin = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-pScissorsInFixedPoint[pViewportIndex[0]].ymin);
-scisXmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-pScissorsInFixedPoint[pViewportIndex[0]].xmax);
-scisYmax = _simd_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-pScissorsInFixedPoint[pViewportIndex[01]].ymax,
-pScissorsInFixedPoint[pViewportIndex[00]].ymax);
-}
-
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const 
uint32_t *pViewportIndex,
-simd16scalari , simd16scalari , simd16scalari , 
simd16scalari )
-{
-scisXmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[15]].xmin,
-pScissorsInFixedPoint[pViewportIndex[14]].xmin,
-pScissorsInFixedPoint[pViewportIndex[13]].xmin,
-pScissorsInFixedPoint[pViewportIndex[12]].xmin,
-pScissorsInFixedPoint[pViewportIndex[11]].xmin,
-pScissorsInFixedPoint[pViewportIndex[10]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 9]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 8]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 7]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 6]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 5]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 4]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 3]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 2]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 1]].xmin,
-pScissorsInFixedPoint[pViewportIndex[ 0]].xmin);
-
-scisYmin = _simd16_set_epi32(
-pScissorsInFixedPoint[pViewportIndex[15]].ymin,
-pScissorsInFixedPoint[pViewportIndex[14]].ymin,
-pScissorsInFixedPoint[pViewportIndex[13]].ymin,
-pScissorsInFixedPoint[pViewportIndex[12]].ymin,
-pScissorsInFixedPoint[pViewportIndex[11]].ymin,
-pScissorsInFixedPoint[pViewportIndex[10]].ymin,
-pScissorsInFixedPoint[pViewportIndex[ 9]].ymin,
-pScissorsInFixedPoint[pViewportIndex[ 8]].ymin,
-

[Mesa-dev] [PATCH 09/20] swr/rast: Pass prim to ClipSimd

2017-12-14 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/clip.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 148f661ab4..8b947668d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -437,7 +437,7 @@ public:
 return SIMD_T::movemask_ps(vClipCullMask);
 }
 
-void ClipSimd(const typename SIMD_T::Float , const typename 
SIMD_T::Float , PA_STATE , const typename SIMD_T::Integer 
, const typename SIMD_T::Integer )
+void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename 
SIMD_T::Float , const typename SIMD_T::Float , PA_STATE 
, const typename SIMD_T::Integer , const typename SIMD_T::Integer 
)
 {
 // input/output vertex store for clipper
 SIMDVERTEX_T vertices[7]; // maximum 7 verts generated per 
triangle
@@ -452,10 +452,9 @@ public:
 
 // assemble pos
 typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
-pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
+vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
 }
 
 // assemble attribs
@@ -568,7 +567,8 @@ public:
 SIMDVERTEX_T transposedPrims[2];
 
 #endif
-for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+uint32_t numInputPrims = pa.NumPrims();
+for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 {
 uint32_t numEmittedVerts = pVertexCount[inputPrim];
 if (numEmittedVerts < NumVertsPerPrim)
@@ -716,7 +716,7 @@ public:
 AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
 // we have to clip tris, execute the clipper, which will also
 // call the binner
-ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), 
pa, primId, viewportIdx);
+ClipSimd(prim, SIMD_T::vmask_ps(primMask), 
SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx);
 AR_END(FEGuardbandClip, 1);
 }
 else if (validMask)
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/10] swr/rast: Simplify GATHER* jit builder api

2017-11-20 Thread Tim Rowley
General cleanup, and prep work for possibly moving to llvm masked
gather intrinsic.
---
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 32 ++---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  6 +--
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 56 +++---
 src/gallium/drivers/swr/swr_shader.cpp |  2 +-
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index daa9cb1ec1..bd3a52566d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -554,7 +554,7 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
@@ -563,7 +563,7 @@ namespace SwrJit
 {
 // force mask to , required by vgather
 vMask = BITCAST(vMask, mSimdFP32Ty);
-vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
+vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
 }
 else
 {
@@ -574,7 +574,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = VUNDEF_F();
-Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
+Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices,vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
@@ -606,14 +606,14 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
 {
@@ -624,7 +624,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = VUNDEF_I();
-Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+Value *vScaleVec = VIMMED1((uint32_t)scale);
 Value *vOffsets = MUL(vIndices, vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth; ++i)
@@ -656,14 +656,14 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, Value* scale)
+Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
 {
 Value* vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
-vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
+vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
 }
 else
 {
@@ -674,7 +674,7 @@ namespace SwrJit
 STORE(vSrc, vSrcPtr);
 
 vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
-Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
+Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
 Value *vOffsets = MUL(vIndices,vScaleVec);
 Value *mask = MASK(vMask);
 for(uint32_t i = 0; i < mVWidth/2; ++i)
@@ -1016,7 +1016,7 @@ namespace SwrJit
 // save mask as it is zero'd out after each gather
 vMask = mask;
 
-vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask, C((char)1));
+vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, 
byteOffsets, vMask);
 // e.g. result of first 8x32bit integer gather for 16bit 
components
 // 256i - 01234567
 //xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
@@ -1029,7 +1029,7 @@ namespace SwrJit
 pSrcBase = GEP(pSrcBase, C((char)4));
 vMask = mask;
 
-  

[Mesa-dev] [PATCH 05/10] swr/rast: Enable AVX-512 targets in the jitter

2017-11-20 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/knobs.h| 8 
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.h | 2 --
 2 files changed, 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h 
b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index fe0a044ae8..e00e2da650 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -61,18 +61,10 @@
 #define KNOB_SIMD_WIDTH 8
 #define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX512)
-#if 0
-// not ready to enable this globally, enabled on the side (below)
 #define KNOB_ARCH_ISA AVX512F
 #define KNOB_ARCH_STR "AVX512"
-#define KNOB_SIMD_WIDTH 16
-#define KNOB_SIMD_BYTES 64
-#else
-#define KNOB_ARCH_ISA AVX2
-#define KNOB_ARCH_STR "AVX2"
 #define KNOB_SIMD_WIDTH 8
 #define KNOB_SIMD_BYTES 32
-#endif
 #else
 #error "Unknown architecture"
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 46ffe276a0..c30a807222 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -102,14 +102,12 @@ public:
 bForceAVX2 = true;
 bForceAVX512 = false;
 }
-#if 0
 else if(isaRequest == "avx512")
 {
 bForceAVX = false;
 bForceAVX2 = false;
 bForceAVX512 = true;
 }
-#endif
 };
 
 bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/10] swr/rast: Cache eventmanager

2017-11-20 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/archrast/archrast.h | 1 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp| 5 +
 src/gallium/drivers/swr/rasterizer/core/api.h  | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h 
b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
index fa88a4948c..c74d6ad909 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
@@ -29,6 +29,7 @@
 
 #include "common/os.h"
 #include "gen_ar_event.hpp"
+#include "eventmanager.h"
 
 namespace ArchRast
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 20eeb29681..9265440904 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -143,6 +143,11 @@ HANDLE SwrCreateContext(
 #endif
 }
 
+#if defined(KNOB_ENABLE_AR)
+// cache the API thread event manager, for use with sim layer
+pCreateInfo->hArEventManager = pContext->pArContext[16];
+#endif
+
 // State setup AFTER context is fully initialized
 SetupDefaultState(pContext);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 60f56c6d76..c032b0bb10 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -213,6 +213,9 @@ struct SWR_CREATECONTEXT_INFO
 // Output: size required memory passed to for SwrSaveState / 
SwrRestoreState
 size_t  contextSaveSize;
 
+// ArchRast event manager.
+HANDLE  hArEventManager;
+
 // Input (optional): Threading info that overrides any set KNOB values.
 SWR_THREADING_INFO* pThreadInfo;
 
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/10] swr/rast: Widen fetch shader to SIMD16

2017-11-20 Thread Tim Rowley
Widen fetch shader to SIMD16, enable SIMD16 types in the jitter,
and provide EXTRACT/INSERT SIMD8 <-> SIMD16 utility functions.
---
 .../drivers/swr/rasterizer/jitter/builder.cpp  | 20 
 .../drivers/swr/rasterizer/jitter/builder.h| 16 ++
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 52 
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  9 
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 57 --
 5 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 6a33ec265f..4b83a3204c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -41,6 +41,9 @@ namespace SwrJit
 : mpJitMgr(pJitMgr)
 {
 mVWidth = pJitMgr->mVWidth;
+#if USE_SIMD16_BUILDER
+mVWidth2 = pJitMgr->mVWidth * 2;
+#endif
 
 mpIRBuilder = >mBuilder;
 
@@ -65,17 +68,34 @@ namespace SwrJit
 mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
 mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
 mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+#if USE_SIMD16_BUILDER
+mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
+mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
+mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
+mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
+mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
+mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
+mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
+mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
+#endif
 
 if (sizeof(uint32_t*) == 4)
 {
 mIntPtrTy = mInt32Ty;
 mSimdIntPtrTy = mSimdInt32Ty;
+#if USE_SIMD16_BUILDER
+mSimd2IntPtrTy = mSimd2Int32Ty;
+#endif
 }
 else
 {
 SWR_ASSERT(sizeof(uint32_t*) == 8);
+
 mIntPtrTy = mInt64Ty;
 mSimdIntPtrTy = mSimdInt64Ty;
+#if USE_SIMD16_BUILDER
+mSimd2IntPtrTy = mSimd2Int64Ty;
+#endif
 }
 }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h 
b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 8210e49b18..c6ab64e06e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -32,6 +32,8 @@
 #include "JitManager.h"
 #include "common/formats.h"
 
+#define USE_SIMD16_BUILDER 0
+
 namespace SwrJit
 {
 using namespace llvm;
@@ -45,6 +47,9 @@ namespace SwrJit
 IRBuilder<>* mpIRBuilder;
 
 uint32_t mVWidth;
+#if USE_SIMD16_BUILDER
+uint32_t mVWidth2;
+#endif
 
 // Built in types.
 Type*mVoidTy;
@@ -70,6 +75,17 @@ namespace SwrJit
 Type*mSimdIntPtrTy;
 Type*mSimdVectorTy;
 Type*mSimdVectorTRTy;
+#if USE_SIMD16_BUILDER
+Type*mSimd2FP16Ty;
+Type*mSimd2FP32Ty;
+Type*mSimd2Int1Ty;
+Type*mSimd2Int16Ty;
+Type*mSimd2Int32Ty;
+Type*mSimd2Int64Ty;
+Type*mSimd2IntPtrTy;
+Type*mSimd2VectorTy;
+Type*mSimd2VectorTRTy;
+#endif
 
 #include "gen_builder.hpp"
 #include "gen_builder_x86.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 9ca36b2467..daa9cb1ec1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -231,6 +231,13 @@ namespace SwrJit
 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VUNDEF2_F()
+{
+return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
+}
+
+#endif
 Value *Builder::VUNDEF(Type* t)
 {
 return UndefValue::get(VectorType::get(t, mVWidth));
@@ -690,6 +697,51 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+//
+/// @brief
+Value *Builder::EXTRACT(Value *a2, uint32_t imm)
+{
+const uint32_t i0 = (imm > 0) ? mVWidth : 0;
+
+Value *result = VUNDEF_F();
+
+for (uint32_t i = 0; i < mVWidth; i += 1)
+{
+Value *temp = VEXTRACT(a2, C(i0 + i));
+
+result = VINSERT(result, temp, C(i));
+}
+
+return result;
+}
+
+//
+/// @brief
+Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
+{
+   

[Mesa-dev] [PATCH 03/10] swr/rast: Code style change (NFC)

2017-11-20 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index e15b300979..2fe6cfcf69 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -39,6 +39,7 @@
 #include "tilemgr.h"
 #include "tessellator.h"
 #include 
+#include 
 
 //
 /// @brief Helper macro to generate a bitmask
@@ -770,6 +771,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, 
uint32_t numVerts, uint32_t
 }
 }
 
+
 //
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
@@ -1335,8 +1337,11 @@ static void TessellationStages(
 
 SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND
-tessPa.useAlternateOffset = false;
-pfnClipFunc(pDC, tessPa, workerId, prim_simd16, 
GenMask(numPrims), primID);
+
+{
+tessPa.useAlternateOffset = false;
+pfnClipFunc(pDC, tessPa, workerId, prim_simd16, 
GenMask(numPrims), primID);
+}
 #else
 pfnClipFunc(pDC, tessPa, workerId, prim,
 GenMask(tessPa.NumPrims()), 
_simd_set1_epi32(dsContext.PrimitiveID));
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 00/10] swr: update rasterizer

2017-11-20 Thread Tim Rowley
Highlights are code cleanups and more progress on simd16.

Tim Rowley (10):
  swr/rast: support flexible vertex layout for DS output
  swr/rast: Widen fetch shader to SIMD16
  swr/rast: Code style change (NFC)
  swr/rast: Points with clipdistance can't go through simplepoints path
  swr/rast: Enable AVX-512 targets in the jitter
  swr/rast: Cache eventmanager
  swr/rast: Add alignment to transpose targets
  swr/rast: Simplify GATHER* jit builder api
  swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader
  swr/rast: Repair simd8 frontend code rot

 .../drivers/swr/rasterizer/archrast/archrast.h |   1 +
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |   1 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp|   5 +
 src/gallium/drivers/swr/rasterizer/core/api.h  |   3 +
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  16 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  12 +-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |   3 +-
 src/gallium/drivers/swr/rasterizer/core/knobs.h|   8 -
 src/gallium/drivers/swr/rasterizer/core/state.h|   2 +
 .../drivers/swr/rasterizer/jitter/JitManager.h |   2 -
 .../drivers/swr/rasterizer/jitter/builder.cpp  |  20 ++
 .../drivers/swr/rasterizer/jitter/builder.h|  16 ++
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 202 ++---
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  38 +++-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 180 ++
 src/gallium/drivers/swr/swr_shader.cpp |   2 +-
 16 files changed, 429 insertions(+), 82 deletions(-)

-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/10] swr/rast: Add alignment to transpose targets

2017-11-20 Thread Tim Rowley
Needed to ensure alignment for avx512.

Fixes address sanitizer crash.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index b624ae69b3..9d1f0d8799 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -796,10 +796,10 @@ endBinTriangles:
 
 // transpose verts needed for backend
 /// @todo modify BE to take non-transformed verts
-simd4scalar vHorizX[SIMD_WIDTH];
-simd4scalar vHorizY[SIMD_WIDTH];
-simd4scalar vHorizZ[SIMD_WIDTH];
-simd4scalar vHorizW[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 
 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
@@ -1510,10 +1510,10 @@ void BinPostSetupLinesImpl(
 
 // transpose verts needed for backend
 /// @todo modify BE to take non-transformed verts
-simd4scalar vHorizX[SIMD_WIDTH];
-simd4scalar vHorizY[SIMD_WIDTH];
-simd4scalar vHorizZ[SIMD_WIDTH];
-simd4scalar vHorizW[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
+OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 
 if (!primMask)
 {
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/10] swr/rast: support flexible vertex layout for DS output

2017-11-20 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 1 +
 src/gallium/drivers/swr/rasterizer/core/state.h  | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 211e9e4b07..e15b300979 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1237,6 +1237,7 @@ static void TessellationStages(
 dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
 dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
 dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
 #if USE_SIMD16_FRONTEND
 dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); 
 // simd8 -> simd16
 #else
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2af384fd90..d11ffc69b0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -288,6 +288,7 @@ struct SWR_DS_CONTEXT
 uint32_tPrimitiveID;// IN: (SCALAR) PrimitiveID for the patch 
associated with the DS invocation
 uint32_tvectorOffset;   // IN: (SCALAR) vector index offset into 
SIMD data.
 uint32_tvectorStride;   // IN: (SCALAR) stride (in vectors) of 
output data per attribute-component
+uint32_toutVertexAttribOffset; // IN: (SCALAR) Offset to the 
attributes as processed by the next shader stage.
 ScalarPatch*pCpIn;  // IN: (SCALAR) Control patch
 simdscalar* pDomainU;   // IN: (SIMD) Domain Point U coords
 simdscalar* pDomainV;   // IN: (SIMD) Domain Point V coords
@@ -819,6 +820,7 @@ struct SWR_TS_STATE
 uint32_tnumHsOutputAttribs;
 uint32_tnumDsOutputAttribs;
 uint32_tdsAllocationSize;
+uint32_tdsOutVtxAttribOffset;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
 uint32_tvertexAttribOffset;
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/10] swr/rast: Points with clipdistance can't go through simplepoints path

2017-11-20 Thread Tim Rowley
Fixes piglit glsl-1.20:vs-clip-vertex-primitives and
glsl-1.30:vs-clip-distance-primitives.
---
 src/gallium/drivers/swr/rasterizer/core/frontend.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h 
b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 5cb2f87c15..11099d6449 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -352,7 +352,8 @@ bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
 state.rastState.pointSize == 1.0f &&
 !state.rastState.pointParam &&
-!state.rastState.pointSpriteEnable);
+!state.rastState.pointSpriteEnable &&
+!state.backendState.clipDistanceMask);
 }
 
 INLINE
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 09/10] swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader

2017-11-20 Thread Tim Rowley
Disabled for now.
---
 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |   1 +
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 126 +++--
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  31 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  91 ---
 4 files changed, 220 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index ce892a9abe..44fc857371 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -44,6 +44,7 @@ inst_aliases = {
 intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 
'indices', 'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index bd3a52566d..8ffe05b41c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -211,6 +211,28 @@ namespace SwrJit
 return ConstantVector::getSplat(mVWidth, cast(C(i)));
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VIMMED2_1(int i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(uint32_t i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(float i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+Value *Builder::VIMMED2_1(bool i)
+{
+return ConstantVector::getSplat(mVWidth2, cast(C(i)));
+}
+
+#endif
 Value *Builder::VUNDEF_IPTR()
 {
 return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
@@ -237,6 +259,11 @@ namespace SwrJit
 return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
 }
 
+Value *Builder::VUNDEF2_I()
+{
+return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
+}
+
 #endif
 Value *Builder::VUNDEF(Type* t)
 {
@@ -254,6 +281,19 @@ namespace SwrJit
 return VECTOR_SPLAT(mVWidth, src);
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::VBROADCAST2(Value *src)
+{
+// check if src is already a vector
+if (src->getType()->isVectorTy())
+{
+return src;
+}
+
+return VECTOR_SPLAT(mVWidth2, src);
+}
+
+#endif
 uint32_t Builder::IMMED(Value* v)
 {
 SWR_ASSERT(isa(v));
@@ -554,16 +594,17 @@ namespace SwrJit
 /// @param vIndices - SIMD wide value of VB byte offsets
 /// @param vMask - SIMD wide mask that controls whether to access memory 
or the src values
 /// @param scale - value to scale indices by
-Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, 
Value* vMask, uint8_t scale)
+Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value 
*vMask, uint8_t scale)
 {
-Value* vGather;
+Value *vGather;
 
 // use avx2 gather instruction if available
 if(JM()->mArch.AVX2())
 {
 // force mask to , required by vgather
-vMask = BITCAST(vMask, mSimdFP32Ty);
-vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
+Value *mask = BITCAST(vMask, mSimdFP32Ty);
+
+vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
 }
 else
 {
@@ -598,6 +639,41 @@ namespace SwrJit
 return vGather;
 }
 
+#if USE_SIMD16_BUILDER
+Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, 
Value *vMask, uint8_t scale)
+{
+Value *vGather = VUNDEF2_F();
+
+// use avx512 gather instruction if available
+if (JM()->mArch.AVX512F())
+{
+// force mask to , required by vgather2
+Value *mask = BITCAST(MASK2(vMask), mInt16Ty);
+
+vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, 
C((uint32_t)scale));
+}
+else
+{
+Value *src0 = EXTRACT2_F(vSrc, 0);
+Value *src1 = EXTRACT2_F(vSrc, 1);
+
+Value *indices0 = EXTRACT2_I(vIndices, 0);
+Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+Value *mask0 = EXTRACT2_I(vMask, 0);
+Value *mask1 = EXTRACT2_I(vMask, 1);
+
+Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
+Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, 

[Mesa-dev] [PATCH 10/10] swr/rast: Repair simd8 frontend code rot

2017-11-20 Thread Tim Rowley
Keep non-default simd8 frontend code running for comparison purposes.
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 2fe6cfcf69..5a61dc33a0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -956,7 +956,7 @@ static void GeometryShaderStage(
 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, 
processCutVerts, pa.numVertsPerPrim);
 
 #else
-PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, 
numAttribs, pState->outputTopology, processCutVerts);
+PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, 
numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, 
numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
 
 #endif
 while (gsPa.GetNextStreamOutput())
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: Use gather instruction for i32gather_ps on simd16/avx512

2017-11-13 Thread Tim Rowley
Speed up avx512 platforms; fixes performance regression caused
by switch to simdlib.

Cc: mesa-sta...@lists.freedesktop.org
---
 .../drivers/swr/rasterizer/common/simdlib_512_avx512.inl | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 95e4c31909..c13b9f616a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -484,17 +484,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
 template
 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // 
return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-uint32_t *pOffsets = (uint32_t*)
-Float vResult;
-float* pResult = (float*)
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-uint32_t offset = pOffsets[i];
-offset = offset * static_cast(ScaleT);
-pResult[i] = *(float const*)(((uint8_t const*)p + offset));
-}
-
-return vResult;
+return _mm512_i32gather_ps(idx, p, static_cast(ScaleT));
 }
 
 static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p
(broadcast 1 value to all elements)
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: Faster emulated simd16 permute

2017-11-13 Thread Tim Rowley
Speed up simd16 frontend (default) on avx/avx2 platforms;
fixes performance regression caused by switch to simdlib.

Cc: mesa-sta...@lists.freedesktop.org
---
 .../swr/rasterizer/common/simdlib_512_emu.inl  | 34 +++---
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index d6af7b1c64..44eba0b126 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -521,36 +521,24 @@ SIMD_IWRAPPER_2(packus_epi32); // See documentation 
for _mm256_packus_epi32
 
 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const , Integer 
const ) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
-Integer result;
-
-// Ugly slow implementation
-uint32_t const *pA = reinterpret_cast();
-uint32_t const *pSwiz = reinterpret_cast();
-uint32_t *pResult = reinterpret_cast();
-
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-pResult[i] = pA[0xF & pSwiz[i]];
-}
-
-return result;
+return castps_si(permute_ps(castsi_ps(a), swiz));
 }
 
 static SIMDINLINE Float SIMDCALL permute_ps(Float const , Integer const 
)// return a[swiz[i]] for each 32-bit lane i (float)
 {
-Float result;
+const auto mask = SIMD256T::set1_epi32(7);
 
-// Ugly slow implementation
-float const *pA = reinterpret_cast();
-uint32_t const *pSwiz = reinterpret_cast();
-float *pResult = reinterpret_cast();
+auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], 
mask));
+auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], 
mask));
 
-for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-{
-pResult[i] = pA[0xF & pSwiz[i]];
-}
+auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], 
mask));
+auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], 
mask));
 
-return result;
+return Float
+{
+SIMD256T::blendv_ps(lolo, lohi, 
SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
+SIMD256T::blendv_ps(hilo, hihi, 
SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
+};
 }
 
 // All of the 512-bit permute2f128_XX intrinsics do the following:
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: allow arch rounding with avx512

2017-11-01 Thread Tim Rowley
Fixes piglit vs-roundeven-{float,vec[234]} with simd16 VS.
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index cf1958b3b6..a1edd349f1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1953,7 +1953,8 @@ arch_rounding_available(const struct lp_type type)
 {
if ((util_cpu_caps.has_sse4_1 &&
(type.length == 1 || type.width*type.length == 128)) ||
-   (util_cpu_caps.has_avx && type.width*type.length == 256))
+   (util_cpu_caps.has_avx && type.width*type.length == 256) ||
+   (util_cpu_caps.has_avx512f && type.width*type.length == 512))
   return TRUE;
else if ((util_cpu_caps.has_altivec &&
 (type.width == 32 && type.length == 4)))
-- 
2.14.1

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallium: add more exceptions to tgsi_util_get_inst_usage_mask

2017-10-19 Thread Tim Rowley
A number of double/int64 operations don't have matching
read and write usage masks, which the fallthrough case of
tgsi_util_get_inst_usage_mask assumes for componentwise
tagged instructions.

No regressions in llvmpipe piglit; fixes a large number of
swr regressions.
---
 src/gallium/auxiliary/tgsi/tgsi_util.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c 
b/src/gallium/auxiliary/tgsi/tgsi_util.c
index cfce59093c..afe5690ce0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -230,13 +230,25 @@ tgsi_util_get_inst_usage_mask(const struct 
tgsi_full_instruction *inst,
   read_mask = TGSI_WRITEMASK_XYZ;
   break;
 
+   case TGSI_OPCODE_DSEQ:
+   case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_DSLT:
+   case TGSI_OPCODE_DSGE:
case TGSI_OPCODE_DP4:
case TGSI_OPCODE_PK4B:
case TGSI_OPCODE_PK4UB:
case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_D2I:
+   case TGSI_OPCODE_D2U:
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_U64SEQ:
+   case TGSI_OPCODE_U64SNE:
+   case TGSI_OPCODE_U64SLT:
+   case TGSI_OPCODE_U64SGE:
case TGSI_OPCODE_U642F:
+   case TGSI_OPCODE_I64SLT:
+   case TGSI_OPCODE_I64SGE:
case TGSI_OPCODE_I642F:
   read_mask = TGSI_WRITEMASK_XYZW;
   break;
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/7] swr/rast: Add api to override draws in flight

2017-10-19 Thread Tim Rowley
Allow the maximum number of draws in flight to be overridden via
SWR_CREATECONTEXT_INFO.

Patch by Jan Zielinski.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp| 26 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  4 
 src/gallium/drivers/swr/rasterizer/core/context.h  |  2 ++
 .../drivers/swr/rasterizer/core/threads.cpp| 18 +++
 4 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 6323098..20eeb29 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -74,13 +74,19 @@ HANDLE SwrCreateContext(
 
 pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
-pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
+if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
+{
+pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
+}
+
+pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
+pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
 
-pContext->pMacroTileManagerArray = 
(MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 
64);
-pContext->pDispatchQueueArray = 
(DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 
64);
+pContext->pMacroTileManagerArray = 
(MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * 
pContext->MAX_DRAWS_IN_FLIGHT, 64);
+pContext->pDispatchQueueArray = 
(DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * 
pContext->MAX_DRAWS_IN_FLIGHT, 64);
 
-for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
 {
 pContext->dcRing[dc].pArena = new 
CachingArena(pContext->cachingArenaAllocator);
 new (>pMacroTileManagerArray[dc]) 
MacroTileMgr(*pContext->dcRing[dc].pArena);
@@ -173,7 +179,7 @@ template
 void QueueWork(SWR_CONTEXT *pContext)
 {
 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
-uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 
 if (IsDraw)
 {
@@ -257,7 +263,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool 
isSplitDraw = false)
 }
 
 uint64_t curDraw = pContext->dcRing.GetHead();
-uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 
 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
 (curDraw - pContext->lastDrawChecked) > 0x1)
@@ -273,7 +279,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool 
isSplitDraw = false)
 pContext->pCurDrawContext = pCurDrawContext;
 
 // Assign next available entry in DS ring to this DC.
-uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dsIndex = pContext->curStateId % 
pContext->MAX_DRAWS_IN_FLIGHT;
 pCurDrawContext->pState = >dsRing[dsIndex];
 
 // Copy previous state to current state.
@@ -361,7 +367,7 @@ void SwrDestroyContext(HANDLE hContext)
 DestroyThreadPool(pContext, >threadPool);
 
 // free the fifos
-for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
+for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
 {
 AlignedFree(pContext->dcRing[i].dynState.pStats);
 delete pContext->dcRing[i].pArena;
@@ -1481,7 +1487,7 @@ void SwrDispatch(
 pTaskData->threadGroupCountZ = threadGroupCountZ;
 
 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * 
threadGroupCountZ;
-uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 pDC->pDispatch = >pDispatchQueueArray[dcIndex];
 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, 
);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 577cfb1..60f56c6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -215,6 +215,10 @@ struct SWR_CREATECONTEXT_INFO
 
 // Input (optional): Threading info that overrides any set KNOB values.
 SWR_THREADING_INFO* pThreadInfo;
+
+// Input: if set to non-zero value, overrides KNOB value for maximum
+// number of draws in flight
+uint32_t MAX_DRAWS_IN_FLIGHT;
 };
 
 //
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index bcd5801..ae942f1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -481,6 +481,8 @@ struct SWR_CONTEXT
  

[Mesa-dev] [PATCH 4/7] swr/rast: Change DS memory allocation

2017-10-19 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 4 ++--
 src/gallium/drivers/swr/rasterizer/core/state.h  | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index a803512..211e9e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1212,9 +1212,9 @@ static void TessellationStages(
 // Allocate DS Output memory
 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, 
KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
 #if USE_SIMD16_FRONTEND
-size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;  // 
simd8 -> simd16, padding
+size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;  // 
simd8 -> simd16, padding
 #else
-size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
+size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.dsAllocationSize;
 size_t requiredAllocSize = sizeof(simdvector) * 
requiredDSOutputVectors;
 #endif
 if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index d9450fc..2af384f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -818,6 +818,7 @@ struct SWR_TS_STATE
 uint32_tnumHsInputAttribs;
 uint32_tnumHsOutputAttribs;
 uint32_tnumDsOutputAttribs;
+uint32_tdsAllocationSize;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
 uint32_tvertexAttribOffset;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/7] swr/rast: Minor changes for os-x

2017-10-19 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/threads.cpp | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 4bb395d..9ece064 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -30,7 +30,7 @@
 #include 
 #include 
 
-#if defined(__linux__) || defined(__gnu_linux__)
+#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
 #include 
 #include 
 #include 
@@ -218,6 +218,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, 
uint32_t& out_numThread
 }
 }
 
+#elif defined(__APPLE__)
+
 #else
 
 #error Unsupported platform
@@ -291,7 +293,7 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, 
uint32_t procGroupId =
 
 SetThreadGroupAffinity(GetCurrentThread(), , nullptr);
 
-#else
+#elif defined(__linux__) || defined(__gnu_linux__)
 
 cpu_set_t cpuset;
 pthread_t thread = pthread_self();
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/7] swr: rasterizer update

2017-10-19 Thread Tim Rowley
Highlights are code cleanups, some more simd16 work (disabled by default),
and tuning for the Intel Xeon Phi architecture.

Tim Rowley (7):
  swr/rast: Minor changes for os-x
  swr/rast: Miscellaneous viewport array code changes
  swr/rast: Fix indentation
  swr/rast: Change DS memory allocation
  swr/rast: Widen fetch shader to SIMD16 (disabled for now)
  swr/rast: Add api to override draws in flight
  swr: knob overrides for Intel Xeon Phi

 src/gallium/drivers/swr/rasterizer/core/api.cpp|  26 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |   4 +
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  45 ++-
 src/gallium/drivers/swr/rasterizer/core/clip.h |  14 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  |   2 +
 .../drivers/swr/rasterizer/core/frontend.cpp   |  26 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |  24 +-
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp |   4 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|   3 +-
 .../drivers/swr/rasterizer/core/threads.cpp|  24 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 441 -
 src/gallium/drivers/swr/swr_context.cpp|  27 ++
 src/gallium/drivers/swr/swr_context.h  |   2 +
 src/gallium/drivers/swr/swr_loader.cpp |   4 +
 src/gallium/drivers/swr/swr_scratch.cpp|   2 +-
 src/gallium/drivers/swr/swr_screen.h   |   3 +
 16 files changed, 575 insertions(+), 76 deletions(-)

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/7] swr/rast: Fix indentation

2017-10-19 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index f7c9308..d9450fc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -820,7 +820,7 @@ struct SWR_TS_STATE
 uint32_tnumDsOutputAttribs;
 
 // Offset to the start of the attributes of the input vertices, in 
simdvector units
-uint32_t vertexAttribOffset;
+uint32_tvertexAttribOffset;
 };
 
 // output merger state
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/7] swr/rast: Miscellaneous viewport array code changes

2017-10-19 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 45 --
 src/gallium/drivers/swr/rasterizer/core/clip.h | 14 +--
 .../drivers/swr/rasterizer/core/frontend.cpp   | 22 ++-
 src/gallium/drivers/swr/rasterizer/core/pa.h   | 24 ++--
 src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp |  4 +-
 5 files changed, 71 insertions(+), 38 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index e08e489..b624ae6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -450,16 +450,22 @@ void SIMDCALL BinTrianglesImpl(
 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[3];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[3];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -815,6 +821,7 @@ endBinTriangles:
 SIMD_T::store_si(reinterpret_cast(aRTAI), 
SIMD_T::setzero_si());
 }
 
+
 // scan remaining valid triangles and bin each separately
 while (_BitScanForward(, triMask))
 {
@@ -1299,15 +1306,22 @@ void BinPointsImpl(
 const SWR_RASTSTATE& rastState = state.rastState;
 
 // Read back viewport index if required
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[1];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[1];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -1626,15 +1640,22 @@ void SIMDCALL BinLinesImpl(
 
 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), 
SIMD_T::set1_ps(1.0f) };
 
-typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+typename SIMD_T::Vec4 vpiAttrib[2];
+typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
 if (state.backendState.readViewportArrayIndex)
 {
-typename SIMD_T::Vec4 vpiAttrib[2];
 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+}
+
+
+if (state.backendState.readViewportArrayIndex) // VPAIOffsets are 
guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+{
 // OOB indices => forced to zero.
-typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index e9a410d..0d3d780 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -641,7 

[Mesa-dev] [PATCH 7/7] swr: knob overrides for Intel Xeon Phi

2017-10-19 Thread Tim Rowley
Architecture benefits from having more threads/work outstanding.
---
 src/gallium/drivers/swr/swr_context.cpp | 27 +++
 src/gallium/drivers/swr/swr_context.h   |  2 ++
 src/gallium/drivers/swr/swr_loader.cpp  |  4 
 src/gallium/drivers/swr/swr_scratch.cpp |  2 +-
 src/gallium/drivers/swr/swr_screen.h|  3 +++
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/swr_context.cpp 
b/src/gallium/drivers/swr/swr_context.cpp
index 34d9a25..b61720c 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -39,6 +39,7 @@
 
 #include "api.h"
 #include "backend.h"
+#include "knobs.h"
 
 static struct pipe_surface *
 swr_create_surface(struct pipe_context *pipe,
@@ -483,6 +484,8 @@ swr_create_context(struct pipe_screen *p_screen, void 
*priv, unsigned flags)
ctx->blendJIT =
   new std::unordered_map;
 
+   ctx->max_draws_in_flight = KNOB_MAX_DRAWS_IN_FLIGHT;
+
SWR_CREATECONTEXT_INFO createInfo;
memset(, 0, sizeof(createInfo));
createInfo.privateStateSize = sizeof(swr_draw_context);
@@ -491,6 +494,30 @@ swr_create_context(struct pipe_screen *p_screen, void 
*priv, unsigned flags)
createInfo.pfnClearTile = swr_StoreHotTileClear;
createInfo.pfnUpdateStats = swr_UpdateStats;
createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
+
+   SWR_THREADING_INFO threadingInfo {0};
+
+   threadingInfo.MAX_WORKER_THREADS= KNOB_MAX_WORKER_THREADS;
+   threadingInfo.MAX_NUMA_NODES= KNOB_MAX_NUMA_NODES;
+   threadingInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
+   threadingInfo.MAX_THREADS_PER_CORE  = KNOB_MAX_THREADS_PER_CORE;
+   threadingInfo.SINGLE_THREADED   = KNOB_SINGLE_THREADED;
+
+   // Use non-standard settings for KNL
+   if (swr_screen(p_screen)->is_knl)
+   {
+  if (nullptr == getenv("KNOB_MAX_THREADS_PER_CORE"))
+ threadingInfo.MAX_THREADS_PER_CORE  = 2;
+
+  if (nullptr == getenv("KNOB_MAX_DRAWS_IN_FLIGHT"))
+  {
+ ctx->max_draws_in_flight = 2048;
+ createInfo.MAX_DRAWS_IN_FLIGHT = ctx->max_draws_in_flight;
+  }
+   }
+
+   createInfo.pThreadInfo = 
+
ctx->swrContext = ctx->api.pfnSwrCreateContext();
 
ctx->api.pfnSwrInit();
diff --git a/src/gallium/drivers/swr/swr_context.h 
b/src/gallium/drivers/swr/swr_context.h
index 8bed78f..5c280ee 100644
--- a/src/gallium/drivers/swr/swr_context.h
+++ b/src/gallium/drivers/swr/swr_context.h
@@ -173,6 +173,8 @@ struct swr_context {
unsigned dirty; /**< Mask of SWR_NEW_x flags */
 
SWR_INTERFACE api;
+
+   uint32_t max_draws_in_flight;
 };
 
 static INLINE struct swr_context *
diff --git a/src/gallium/drivers/swr/swr_loader.cpp 
b/src/gallium/drivers/swr/swr_loader.cpp
index e205fe2..9d6f918 100644
--- a/src/gallium/drivers/swr/swr_loader.cpp
+++ b/src/gallium/drivers/swr/swr_loader.cpp
@@ -38,11 +38,14 @@ swr_create_screen(struct sw_winsys *winsys)
 
util_cpu_detect();
 
+   bool is_knl = false;
+
if (!strlen(filename) &&
util_cpu_caps.has_avx512f && util_cpu_caps.has_avx512er) {
 #if HAVE_SWR_KNL
   fprintf(stderr, "KNL ");
   sprintf(filename, "%s%s%s", UTIL_DL_PREFIX, "swrKNL", UTIL_DL_EXT);
+  is_knl = true;
 #else
   fprintf(stderr, "KNL (not built) ");
 #endif
@@ -99,6 +102,7 @@ swr_create_screen(struct sw_winsys *winsys)
 
struct pipe_screen *screen = swr_create_screen_internal(winsys);
swr_screen(screen)->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc;
+   swr_screen(screen)->is_knl = is_knl;
 
return screen;
 }
diff --git a/src/gallium/drivers/swr/swr_scratch.cpp 
b/src/gallium/drivers/swr/swr_scratch.cpp
index d298a48..8afe73c 100644
--- a/src/gallium/drivers/swr/swr_scratch.cpp
+++ b/src/gallium/drivers/swr/swr_scratch.cpp
@@ -45,7 +45,7 @@ swr_copy_to_scratch_space(struct swr_context *ctx,
   ptr = ctx->api.pfnSwrAllocDrawContextMemory(ctx->swrContext, size, 4);
} else {
   /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */
-  unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT;
+  uint32_t max_size_in_flight = size * ctx->max_draws_in_flight;
 
   /* Need to grow space */
   if (max_size_in_flight > space->current_size) {
diff --git a/src/gallium/drivers/swr/swr_screen.h 
b/src/gallium/drivers/swr/swr_screen.h
index 1c4e331..81b1a18 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -54,6 +54,9 @@ struct swr_screen {
 #endif
 
PFNSwrGetInterface pfnSwrGetInterface;
+
+   /* Do we run on Xeon Phi? */
+   bool is_knl;
 };
 
 static INLINE struct swr_screen *
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/7] swr/rast: Widen fetch shader to SIMD16 (disabled for now)

2017-10-19 Thread Tim Rowley
Refactored the gather operation to process 16 elements at a time via
paired SIMD8 operations.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 441 -
 1 file changed, 428 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 1e3db90..30dbcfc 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -89,7 +89,13 @@ struct FetchJit : public Builder
 
 void JitLoadVertices(const FETCH_COMPILE_STATE , Value* 
streams, Value* vIndices, Value* pVtxOut);
 #if USE_SIMD16_SHADERS
+#define USE_SIMD16_GATHERS 0
+
+#if USE_SIMD16_GATHERS
+void JitGatherVertices(const FETCH_COMPILE_STATE , Value 
*streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
+#else
 void JitGatherVertices(const FETCH_COMPILE_STATE , Value* 
streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
+#endif
 #else
 void JitGatherVertices(const FETCH_COMPILE_STATE , Value* 
streams, Value* vIndices, Value* pVtxOut);
 #endif
@@ -279,8 +285,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 }
 else
 {
+#if USE_SIMD16_GATHERS
+JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, 
false);
+#else
 JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
 JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), 
true);
+#endif
 }
 #else
 (fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, 
vIndices, pVtxOut)
@@ -792,8 +802,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value 
*texels[4])
 /// @param vIndices - vector value of indices to gather
 /// @param pVtxOut - value pointer to output simdvertex struct
 #if USE_SIMD16_SHADERS
+#if USE_SIMD16_GATHERS
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE ,
+Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool 
useVertexID2)
+#else
 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE ,
 Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
+#endif
 #else
 void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE ,
 Value* streams, Value* vIndices, Value* pVtxOut)
@@ -802,6 +817,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 uint32_t currentVertexElement = 0;
 uint32_t outputElt = 0;
 Value* vVertexElements[4];
+#if USE_SIMD16_GATHERS
+Value* vVertexElements2[4];
+#endif
 
 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
 Value* startInstance = LOAD(mpFetchInfo, {0, 
SWR_FETCH_CONTEXT_StartInstance});
@@ -809,7 +827,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, 
SWR_FETCH_CONTEXT_BaseVertex}));
 curInstance->setName("curInstance");
 
-for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt 
+= 1)
 {
 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
 
@@ -836,7 +854,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 maxVertex = LOAD(maxVertex);
 
 Value *minVertex = NULL;
-if (fetchState.bPartialVertexBuffer) {
+if (fetchState.bPartialVertexBuffer)
+{
 // min vertex index for low bounds OOB checking
 minVertex = GEP(streams, {C(ied.StreamIndex), 
C(SWR_VERTEX_BUFFER_STATE_minVertex)});
 minVertex = LOAD(minVertex);
@@ -849,10 +868,13 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 }
 
 Value *vCurIndices;
+#if USE_SIMD16_GATHERS
+Value *vCurIndices2;
+#endif
 Value *startOffset;
 Value *vInstanceStride = VIMMED1(0);
 
-if(ied.InstanceEnable)
+if (ied.InstanceEnable)
 {
 Value* stepRate = C(ied.InstanceAdvancementState);
 
@@ -867,6 +889,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
 
 vCurIndices = VBROADCAST(calcInstance);
+#if USE_SIMD16_GATHERS
+vCurIndices2 = VBROADCAST(calcInstance);
+#endif
 
 startOffset = startInstance;
 }
@@ -878,6 +903,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 // offset indices by baseVertex
 vCurIndices = ADD(vIndices, vBaseVertex);
+#if USE_SIMD16_GATHERS
+vCurIndices2 = ADD(vIndices2, vBaseVertex);
+#endif
 
 startOffset = startVertex;
 SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
@@ -886,6 +914,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 {
 // offset indices by baseVertex
  

[Mesa-dev] [PATCH 0/2] gallium/swr: simd16 work in progress

2017-10-11 Thread Tim Rowley
Changes to support swr's work-in-progress native simd16 pipeline.
Currently, enabling this via USE_SIMD16_SHADERS in knobs.h will run the
fetch shader with double-pumped simd8, the vertex shaders in
native simd16, and the rest of the pipeline in simd8.

Tim Rowley (2):
  gallium: allow 512-bit vectors
  swr: simd16 shaders work in progress

 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 14 +++---
 src/gallium/auxiliary/gallivm/lp_bld_type.h |  4 ++--
 src/gallium/drivers/swr/swr_screen.cpp  |  6 ++
 src/gallium/drivers/swr/swr_screen.h|  3 +++
 src/gallium/drivers/swr/swr_shader.cpp  | 14 --
 5 files changed, 30 insertions(+), 11 deletions(-)

-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] gallium: allow 512-bit vectors

2017-10-11 Thread Tim Rowley
Increase the max allowed vector size from 256 to 512.

No piglit llvmpipe regressions running on avx2.

Cc: Dave Airlie 
Cc: Jose Fonseca 
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 14 +++---
 src/gallium/auxiliary/gallivm/lp_bld_type.h |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index de18f629cd..97efc3a399 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1272,9 +1272,9 @@ emit_fetch_constant(
 /**
  * Fetch 64-bit values from two separate channels.
  * 64-bit values are stored split across two channels, like xy and zw.
- * This function creates a set of 16 floats,
+ * This function creates a set of vec_length*2 floats,
  * extracts the values from the two channels,
- * puts them in the correct place, then casts to 8 64-bits.
+ * puts them in the correct place, then casts to vec_length 64-bits.
  */
 static LLVMValueRef
 emit_fetch_64bit(
@@ -1289,9 +1289,9 @@ emit_fetch_64bit(
LLVMValueRef res;
struct lp_build_context *bld_fetch = stype_to_fetch(bld_base, stype);
int i;
-   LLVMValueRef shuffles[16];
+   LLVMValueRef shuffles[2 * (LP_MAX_VECTOR_WIDTH/32)];
int len = bld_base->base.type.length * 2;
-   assert(len <= 16);
+   assert(len <= (2 * (LP_MAX_VECTOR_WIDTH/32)));
 
for (i = 0; i < bld_base->base.type.length * 2; i+=2) {
   shuffles[i] = lp_build_const_int32(gallivm, i / 2);
@@ -1691,7 +1691,7 @@ emit_fetch_deriv(
 }
 
 /**
- * store an array of 8 64-bit into two arrays of 8 floats
+ * store an array of vec-length 64-bit into two arrays of vec_length floats
  * i.e.
  * value is d0, d1, d2, d3 etc.
  * each 64-bit has high and low pieces x, y
@@ -1710,8 +1710,8 @@ emit_store_64bit_chan(struct lp_build_tgsi_context 
*bld_base,
struct lp_build_context *float_bld = _base->base;
unsigned i;
LLVMValueRef temp, temp2;
-   LLVMValueRef shuffles[8];
-   LLVMValueRef shuffles2[8];
+   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH/32];
+   LLVMValueRef shuffles2[LP_MAX_VECTOR_WIDTH/32];
 
for (i = 0; i < bld_base->base.type.length; i++) {
   shuffles[i] = lp_build_const_int32(gallivm, i * 2);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h 
b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index afe8722b05..62f1f85461 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -59,7 +59,7 @@ extern unsigned lp_native_vector_width;
  * Should only be used when lp_native_vector_width isn't available,
  * i.e. sizing/alignment of non-malloced variables.
  */
-#define LP_MAX_VECTOR_WIDTH 256
+#define LP_MAX_VECTOR_WIDTH 512
 
 /**
  * Minimum vector alignment for static variable alignment
@@ -67,7 +67,7 @@ extern unsigned lp_native_vector_width;
  * It should always be a constant equal to LP_MAX_VECTOR_WIDTH/8.  An
  * expression is non-portable.
  */
-#define LP_MIN_VECTOR_ALIGN 32
+#define LP_MIN_VECTOR_ALIGN 64
 
 /**
  * Several functions can only cope with vectors of length up to this value.
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] swr: simd16 shaders work in progress

2017-10-11 Thread Tim Rowley
Start building vertex shaders as simd16.

Disabled by default; set USE_SIMD16_SHADERS in knobs.h to experiment.

Cc: Bruce Cherniak 
---
 src/gallium/drivers/swr/swr_screen.cpp |  6 ++
 src/gallium/drivers/swr/swr_screen.h   |  3 +++
 src/gallium/drivers/swr/swr_shader.cpp | 14 --
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index 639b18f930..46b3a003c6 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -1058,6 +1058,9 @@ swr_destroy_screen(struct pipe_screen *p_screen)
swr_fence_reference(p_screen, >flush_fence, NULL);
 
JitDestroyContext(screen->hJitMgr);
+#if USE_SIMD16_SHADERS
+   JitDestroyContext(screen->hJitMgr16);
+#endif
 
if (winsys->destroy)
   winsys->destroy(winsys);
@@ -1141,6 +1144,9 @@ swr_create_screen_internal(struct sw_winsys *winsys)
 
// Pass in "" for architecture for run-time determination
screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, "", "swr");
+#if USE_SIMD16_SHADERS
+   screen->hJitMgr16 = JitCreateContext(16, "", "swr");
+#endif
 
swr_fence_init(>base);
 
diff --git a/src/gallium/drivers/swr/swr_screen.h 
b/src/gallium/drivers/swr/swr_screen.h
index a11ea9f41d..1c4e331583 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -49,6 +49,9 @@ struct swr_screen {
uint32_t client_copy_limit;
 
HANDLE hJitMgr;
+#if USE_SIMD16_SHADERS
+   HANDLE hJitMgr16;
+#endif
 
PFNSwrGetInterface pfnSwrGetInterface;
 };
diff --git a/src/gallium/drivers/swr/swr_shader.cpp 
b/src/gallium/drivers/swr/swr_shader.cpp
index 510bc0e457..732e08dae7 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -693,7 +693,7 @@ swr_compile_gs(struct swr_context *ctx, swr_jit_gs_key )
 void
 BuilderSWR::WriteVS(Value *pVal, Value *pVsContext, Value *pVtxOutput, 
unsigned slot, unsigned channel)
 {
-#if USE_SIMD16_FRONTEND
+#if USE_SIMD16_FRONTEND && !USE_SIMD16_SHADERS
// interleave the simdvertex components into the dest simd16vertex
//   slot16offset = slot8offset * 2
//   comp16offset = comp8offset * 2 + alternateOffset
@@ -756,6 +756,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key )
const_sizes_ptr->setName("num_vs_constants");
 
Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin});
+#if USE_SIMD16_SHADERS
+   vtxInput = BITCAST(vtxInput, PointerType::get(Gen_simd16vertex(JM()), 0));
+#endif
 
for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) {
   const unsigned mask = swr_vs->info.base.input_usage_mask[attrib];
@@ -777,7 +780,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key )
 
lp_build_tgsi_soa(gallivm,
  swr_vs->pipe.tokens,
- lp_type_float_vec(32, 32 * 8),
+ lp_type_float_vec(32, 32 * mVWidth),
  NULL, // mask
  wrap(consts_ptr),
  wrap(const_sizes_ptr),
@@ -795,6 +798,9 @@ BuilderSWR::CompileVS(struct swr_context *ctx, 
swr_jit_vs_key )
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
 
Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout});
+#if USE_SIMD16_SHADERS
+   vtxOutput = BITCAST(vtxOutput, PointerType::get(Gen_simd16vertex(JM()), 0));
+#endif
 
for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) {
   for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) {
@@ -905,7 +911,11 @@ swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key 
)
   return NULL;
 
BuilderSWR builder(
+#if USE_SIMD16_SHADERS
+  reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr16),
+#else
   reinterpret_cast(swr_screen(ctx->pipe.screen)->hJitMgr),
+#endif
   "VS");
PFN_VERTEX_FUNC func = builder.CompileVS(ctx, key);
 
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] swr/rast: use proper alignment for debug transposedPrims

2017-10-03 Thread Tim Rowley
The unaligned allocation was causing a crash in the ParaView
waveletcontour.py test when _DEBUG is defined, due to a vector-aligned
copy with an unaligned address.
---
 src/gallium/drivers/swr/rasterizer/core/clip.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index cde5261521..e9a410daa3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -561,7 +561,7 @@ public:
 
 #if defined(_DEBUG)
 // TODO: need to increase stack size, allocating SIMD16-widened 
transposedPrims causes stack overflow in debug builds
-SIMDVERTEX_T *transposedPrims = 
reinterpret_cast(malloc(sizeof(SIMDVERTEX_T) * 
2));
+SIMDVERTEX_T *transposedPrims = 
reinterpret_cast(AlignedMalloc(sizeof(SIMDVERTEX_T) * 2, 64));
 
 #else
 SIMDVERTEX_T transposedPrims[2];
@@ -667,7 +667,7 @@ public:
 }
 
 #if defined(_DEBUG)
-free(transposedPrims);
+AlignedFree(transposedPrims);
 
 #endif
 // update global pipeline stat
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] configure.ac: add _DEBUG to strip_unwanted_llvm_flags

2017-10-03 Thread Tim Rowley
Assert-enabled builds of llvm add _DEBUG to the LLVM_CFLAGS.

This was causing a crash with swr running the ParaView
waveletcontour.py test, due to a bug in our _DEBUG code.
---
 configure.ac | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure.ac b/configure.ac
index 903a3979d4..b2768f46c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -987,6 +987,7 @@ strip_unwanted_llvm_flags() {
 echo " `$1` " | sed -E \
 -e 's/[[[:space:]]]+-m[[^[:space:]]]*//g' \
 -e 's/[[[:space:]]]+-DNDEBUG[[[:space:]]]/ /g' \
+-e 's/[[[:space:]]]+-D_DEBUG[[[:space:]]]/ /g' \
 -e 's/[[[:space:]]]+-D_GNU_SOURCE[[[:space:]]]/ /g' \
 -e 's/[[[:space:]]]+-pedantic[[[:space:]]]/ /g' \
 -e 's/[[[:space:]]]+-W[[^[:space:]]]*//g' \
-- 
2.11.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 7/9] swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND

2017-09-21 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 22a5705..aea8e88 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1062,7 +1062,7 @@ struct TessellationThreadLocalData
 size_t tsCtxSize;
 
 simdscalar* pDSOutput;
-size_t numDSOutputVectors;
+size_t dsOutputAllocSize;
 };
 
 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
@@ -1210,24 +1210,20 @@ static void TessellationStages(
 
 // Allocate DS Output memory
 uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, 
KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
-size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
 #if USE_SIMD16_FRONTEND
 size_t requiredAllocSize = sizeof(simdvector) * 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;  // 
simd8 -> simd16, padding
 #else
+size_t requiredDSOutputVectors = requiredDSVectorInvocations * 
tsState.numDsOutputAttribs;
 size_t requiredAllocSize = sizeof(simdvector) * 
requiredDSOutputVectors;
 #endif
-if (requiredDSOutputVectors > 
gt_pTessellationThreadData->numDSOutputVectors)
+if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
 {
 AlignedFree(gt_pTessellationThreadData->pDSOutput);
 gt_pTessellationThreadData->pDSOutput = 
(simdscalar*)AlignedMalloc(requiredAllocSize, 64);
-#if USE_SIMD16_FRONTEND
-gt_pTessellationThreadData->numDSOutputVectors = 
RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 
-> simd16, padding
-#else
-gt_pTessellationThreadData->numDSOutputVectors = 
requiredDSOutputVectors;
-#endif
+gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
 }
 SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
-SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= 
requiredDSOutputVectors);
+SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= 
requiredAllocSize);
 
 #if defined(_DEBUG)
 memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
@@ -1356,7 +1352,7 @@ static void TessellationStages(
 AlignedFree(gt_pTessellationThreadData->pDSOutput);
 gt_pTessellationThreadData->pDSOutput = nullptr;
 }
-gt_pTessellationThreadData->numDSOutputVectors = 0;
+gt_pTessellationThreadData->dsOutputAllocSize = 0;
 
 #endif
 TSDestroyCtx(tsCtx);
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/9] swr: update rasterizer

2017-09-21 Thread Tim Rowley
Highlights: large change in the geometry shader api, cleanups.

Tim Rowley (9):
  swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format
  swr/rast: New GS state/context API
  swr/rast: Fetch compile state changes
  swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack
  swr/rast: Properly sized null GS buffer
  swr/rast: Slightly more efficient blend jit
  swr/rast: Fix allocation of DS output data for USE_SIMD16_FRONTEND
  swr/rast: Remove code supporting legacy llvm (<3.9)
  swr/rast: Handle instanceID offset / Instance Stride enable

 .../drivers/swr/rasterizer/common/formats.cpp  |  27 ++-
 .../drivers/swr/rasterizer/core/format_traits.h|   2 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   | 252 +++--
 src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  11 +-
 .../drivers/swr/rasterizer/jitter/JitManager.h |   7 -
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp|  30 +--
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 118 ++
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|  40 +++-
 .../drivers/swr/rasterizer/jitter/fetch_jit.h  |   7 +-
 src/gallium/drivers/swr/swr_shader.cpp | 183 +++
 11 files changed, 361 insertions(+), 371 deletions(-)

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/9] swr/rast: Move SWR_GS_CONTEXT from thread local storage to stack

2017-09-21 Thread Tim Rowley
Move the structure to the stack, as its size is significantly reduced
due to dynamic allocation of the GS buffers.
---
 .../drivers/swr/rasterizer/core/frontend.cpp   | 23 +++---
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 26e76a9..15bc93d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -708,8 +708,6 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num
 }
 }
 
-THREAD SWR_GS_CONTEXT tlsGsContext;
-
 // Buffers that are allocated if GS is enabled
 struct GsBuffers
 {
@@ -798,21 +796,22 @@ static void GeometryShaderStage(
 
 const API_STATE& state = GetApiState(pDC);
 const SWR_GS_STATE* pState = 
+SWR_GS_CONTEXT gsContext;
 
 static uint8_t sNullBuffer[1024] = { 0 };
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {
-tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
 }
-tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
-tlsGsContext.PrimitiveID = primID;
+gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
+gsContext.PrimitiveID = primID;
 
 uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
 simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
 
 // assemble all attributes for the input primitive
-tlsGsContext.inputVertStride = pState->inputVertStride;
+gsContext.inputVertStride = pState->inputVertStride;
 for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
 {
 uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
@@ -821,7 +820,7 @@ static void GeometryShaderStage(
 
 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 {
-tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = 
attrib[i];
+gsContext.pVerts[attribSlot + pState->inputVertStride * i] = 
attrib[i];
 }
 }
 
@@ -829,7 +828,7 @@ static void GeometryShaderStage(
 pa.Assemble(VERTEX_POSITION_SLOT, attrib);
 for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 {
-tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * 
i] = attrib[i];
+gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = 
attrib[i];
 }
 
 // record valid prims from the frontend to avoid over binning the newly 
generated
@@ -842,15 +841,15 @@ static void GeometryShaderStage(
 
 for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 {
-tlsGsContext.InstanceID = instance;
-tlsGsContext.mask = GenerateMask(numInputPrims);
+gsContext.InstanceID = instance;
+gsContext.mask = GenerateMask(numInputPrims);
 
 // execute the geometry shader
-state.pfnGsFunc(GetPrivateState(pDC), );
+state.pfnGsFunc(GetPrivateState(pDC), );
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {
-tlsGsContext.pStreams[i] += pState->allocationSize;
+gsContext.pStreams[i] += pState->allocationSize;
 }
 }
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/9] swr/rast: Add support for R10G10B10_FLOAT_A2_UNORM pixel format

2017-09-21 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/common/formats.cpp  | 27 +++---
 .../drivers/swr/rasterizer/core/format_traits.h|  2 +-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 16 ++---
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp 
b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
index 263dec6..1c086ff 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
@@ -2729,16 +2729,27 @@ const SWR_FORMAT_INFO gFormatInfo[] = {
 { 0.0f, 0.0f, 0.0f, 0.0f },
 1, 1
 },
-// padding (0xD5)
+
+// R10G10B10_FLOAT_A2_UNORM (0xD5)
 {
-nullptr,
-{ SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN },
-{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-0, 0, 0, false, false, false, false,
-{ false, false, false, false },
-{ 0.0f, 0.0f, 0.0f, 0.0f },
-1, 1
+"R10G10B10_FLOAT_A2_UNORM",
+{ SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM },
+{ 0, 0, 0, 0x3f80 }, // Defaults for missing components
+{ 0, 1, 2, 3 }, // Swizzle
+{ 10, 10, 10, 2 }, // Bits per component
+32, // Bits per element
+4, // Bytes per element
+4, // Num components
+false, // isSRGB
+false, // isBC
+false, // isSubsampled
+false, // isLuminance
+{ false, false, false, false }, // Is normalized?
+{ 1.0f, 1.0f, 1.0f, 1.0f / 3.0f }, // To float scale factor
+1, // bcWidth
+1, // bcHeight
 },
+
 // R32_SINT (0xD6)
 {
 "R32_SINT",
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h 
b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
index c04ea5f..bc585dd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
@@ -1237,7 +1237,7 @@ template<> struct FormatTraits :
 /// FormatTraits - Format traits specialization for 
R10G10B10_FLOAT_A2_UNORM
 //
 template<> struct FormatTraits :
-ComponentTraits,
+ComponentTraits,
 FormatSwizzle<0, 1, 2, 3>,
 Defaults<0, 0, 0, 0x3f80>
 {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 402fd26..b943909 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -42,7 +42,7 @@ namespace SwrJit
 ///number of mantissa bits.
 /// @param val - 32-bit float
 /// @todo Maybe move this outside of this file into a header?
-static uint16_t Convert32To16Float(float val)
+static uint16_t ConvertFloat32ToFloat16(float val)
 {
 uint32_t sign, exp, mant;
 uint32_t roundBits;
@@ -112,7 +112,7 @@ namespace SwrJit
 ///float
 /// @param val - 16-bit float
 /// @todo Maybe move this outside of this file into a header?
-static float ConvertSmallFloatTo32(uint32_t val)
+static float ConvertFloat16ToFloat32(uint32_t val)
 {
 uint32_t result;
 if ((val & 0x7fff) == 0)
@@ -888,11 +888,11 @@ namespace SwrJit
 else
 {
 FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
-Function* pCvtPh2Ps = 
cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32",
 pFuncTy));
+Function* pCvtPh2Ps = 
cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32",
 pFuncTy));
 
-if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == 
nullptr)
+if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == 
nullptr)
 {
-sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void 
*));
+sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", 
(void *));
 }
 
 Value* pResult = UndefValue::get(mSimdFP32Ty);
@@ -921,11 +921,11 @@ namespace SwrJit
 {
 // call scalar C function for now
 FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
-Function* pCvtPs2Ph = 
cast(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", 
pFuncTy));
+Function* pCvtPs2Ph = 
cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16",
 pFuncTy));
 
-if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
+if 
(sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == 

[Mesa-dev] [PATCH 5/9] swr/rast: Properly sized null GS buffer

2017-09-21 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 15bc93d..22a5705 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -798,7 +798,7 @@ static void GeometryShaderStage(
 const SWR_GS_STATE* pState = 
 SWR_GS_CONTEXT gsContext;
 
-static uint8_t sNullBuffer[1024] = { 0 };
+static uint8_t sNullBuffer[128] = { 0 };
 
 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
 {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/9] swr/rast: Fetch compile state changes

2017-09-21 Thread Tim Rowley
Add ForceSequentialAccessEnable and InstanceIDOffsetEnable bools to
FETCH_COMPILE_STATE.
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 6 ++
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h   | 7 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index f3a4b27..9061298 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -275,6 +275,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
  : JitGatherVertices(fetchState, streams, 
vIndices, pVtxOut);
 #endif
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+// TODO: 
+SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable.");
+}
+
 RET_VOID();
 
 JitManager::DumpToFile(fetch, "src");
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index 0dd6de7..18fa963 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -107,6 +107,9 @@ struct FETCH_COMPILE_STATE
 bool bVertexIDOffsetEnable{ false };// Offset vertexID by StartVertex 
for non-indexed draws or BaseVertex for indexed draws
 bool bPartialVertexBuffer{ false }; // for indexed draws, map illegal 
indices to a known resident vertex
 
+bool bForceSequentialAccessEnable{ false };
+bool bInstanceIDOffsetEnable{ false };
+
 FETCH_COMPILE_STATE(bool disableVGATHER = false, bool diableIndexOOBCheck 
= false):
 bDisableVGATHER(disableVGATHER), 
bDisableIndexOOBCheck(diableIndexOOBCheck){ };
 
@@ -120,11 +123,13 @@ struct FETCH_COMPILE_STATE
 if (cutIndex != other.cutIndex) return false;
 if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) return false;
 if (bPartialVertexBuffer != other.bPartialVertexBuffer) return false;
+if (bForceSequentialAccessEnable != 
other.bForceSequentialAccessEnable) return false;
+if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) return 
false;
 
 for(uint32_t i = 0; i < numAttribs; ++i)
 {
 if((layout[i].bits != other.layout[i].bits) ||
-   ((layout[i].InstanceEnable == 1) &&
+   (((layout[i].InstanceEnable == 1) || 
(layout[i].InstanceStrideEnable == 1)) &&
 (layout[i].InstanceAdvancementState != 
other.layout[i].InstanceAdvancementState))){
 return false;
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 8/9] swr/rast: Remove code supporting legacy llvm (<3.9)

2017-09-21 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  11 ++-
 .../drivers/swr/rasterizer/jitter/JitManager.h |   7 --
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 102 ++---
 3 files changed, 15 insertions(+), 105 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index e4281f8..3f0772c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -48,8 +48,9 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Config/llvm-config.h"
 
-#if HAVE_LLVM < 0x400
+#if LLVM_VERSION_MAJOR < 4
 #include "llvm/Bitcode/ReaderWriter.h"
 #else
 #include "llvm/Bitcode/BitcodeWriter.h"
@@ -231,8 +232,8 @@ void JitManager::DumpAsm(Function* pFunction, const char* 
fileName)
 
 #if defined(_WIN32)
 DWORD pid = GetCurrentProcessId();
-TCHAR procname[MAX_PATH];
-GetModuleFileName(NULL, procname, MAX_PATH);
+char procname[MAX_PATH];
+GetModuleFileNameA(NULL, procname, MAX_PATH);
 const char* pBaseName = strrchr(procname, '\\');
 std::stringstream outDir;
 outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
@@ -269,8 +270,8 @@ void JitManager::DumpToFile(Function *f, const char 
*fileName)
 {
 #if defined(_WIN32)
 DWORD pid = GetCurrentProcessId();
-TCHAR procname[MAX_PATH];
-GetModuleFileName(NULL, procname, MAX_PATH);
+char procname[MAX_PATH];
+GetModuleFileNameA(NULL, procname, MAX_PATH);
 const char* pBaseName = strrchr(procname, '\\');
 std::stringstream outDir;
 outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index 4bc543b..46ffe27 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -47,13 +47,6 @@
 #include "llvm/ExecutionEngine/ObjectCache.h"
 
 #include "llvm/Config/llvm-config.h"
-#ifndef LLVM_VERSION_MAJOR
-#include "llvm/Config/config.h"
-#endif
-
-#ifndef HAVE_LLVM
-#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR)
-#endif
 
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index b943909..9ca36b2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -763,22 +763,10 @@ namespace SwrJit
 /// lower 8 values are used.
 Value *Builder::PMOVSXBD(Value* a)
 {
-// llvm-3.9 removed the pmovsxbd intrinsic
-#if HAVE_LLVM < 0x309
-// use avx2 byte sign extend instruction if available
-if(JM()->mArch.AVX2())
-{
-Function *pmovsxbd = 
Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
-return CALL(pmovsxbd, std::initializer_list{a});
-}
-else
-#endif
-{
-// VPMOVSXBD output type
-Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
-// Extract 8 values from 128bit lane and sign extend
-return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
-}
+// VPMOVSXBD output type
+Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+// Extract 8 values from 128bit lane and sign extend
+return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
 }
 
 //
@@ -787,22 +775,10 @@ namespace SwrJit
 /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
 Value *Builder::PMOVSXWD(Value* a)
 {
-// llvm-3.9 removed the pmovsxwd intrinsic
-#if HAVE_LLVM < 0x309
-// use avx2 word sign extend if available
-if(JM()->mArch.AVX2())
-{
-Function *pmovsxwd = 
Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
-return CALL(pmovsxwd, std::initializer_list{a});
-}
-else
-#endif
-{
-// VPMOVSXWD output type
-Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
-// Extract 8 values from 128bit lane and sign extend
-return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
-}
+// VPMOVSXWD output type
+Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+// Extract 8 values from 128bit lane and sign extend
+return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), 
v8x32Ty);
 }
 
 

[Mesa-dev] [PATCH 9/9] swr/rast: Handle instanceID offset / Instance Stride enable

2017-09-21 Thread Tim Rowley
Supported in JitGatherVertices(); FetchJit::JitLoadVertices() may require
similar changes, and we will need to address this if it is determined that
this path is still in use.

Handle Force Sequential Access in FetchJit::Create.
---
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 46 ++
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 9061298..1e3db90 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -222,6 +222,18 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
 default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; 
break;
 }
 
+if(fetchState.bForceSequentialAccessEnable)
+{
+Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+
+// VertexData buffers are accessed sequentially, the index is equal to 
the vertex number
+vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, 
SWR_FETCH_CONTEXT_StartVertex }));
+vIndices = ADD(vIndices, pOffsets);
+#if USE_SIMD16_SHADERS
+vIndices2 = ADD(vIndices, VIMMED1(8));
+#endif
+}
+
 Value* vVertexId = vIndices;
 #if USE_SIMD16_SHADERS
 Value* vVertexId2 = vIndices2;
@@ -275,12 +287,6 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& 
fetchState)
  : JitGatherVertices(fetchState, streams, 
vIndices, pVtxOut);
 #endif
 
-if (fetchState.bInstanceIDOffsetEnable)
-{
-// TODO: 
-SWR_ASSERT((0), "Add support for handling InstanceID Offset Enable.");
-}
-
 RET_VOID();
 
 JitManager::DumpToFile(fetch, "src");
@@ -362,6 +368,11 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 
 vectors.clear();
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this down");
+}
+
 Value *vCurIndices;
 Value *startOffset;
 if(ied.InstanceEnable)
@@ -831,8 +842,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 minVertex = LOAD(minVertex);
 }
 
+if (fetchState.bInstanceIDOffsetEnable)
+{
+// the InstanceID (curInstance) value is offset by 
StartInstanceLocation
+curInstance = ADD(curInstance, startInstance);
+}
+
 Value *vCurIndices;
 Value *startOffset;
+Value *vInstanceStride = VIMMED1(0);
+
 if(ied.InstanceEnable)
 {
 Value* stepRate = C(ied.InstanceAdvancementState);
@@ -853,11 +872,19 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 }
 else if (ied.InstanceStrideEnable)
 {
+// grab the instance advancement state, determines stride in bytes 
from one instance to the next
+Value* stepRate = C(ied.InstanceAdvancementState);
+vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
+
+// offset indices by baseVertex
+vCurIndices = ADD(vIndices, vBaseVertex);
+
+startOffset = startVertex;
 SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
 }
 else
 {
-// offset indices by baseVertex
+// offset indices by baseVertex
 vCurIndices = ADD(vIndices, vBaseVertex);
 
 startOffset = startVertex;
@@ -925,6 +952,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value* vOffsets = MUL(vCurIndices, vStride);
 vOffsets = ADD(vOffsets, vAlignmentOffsets);
 
+// if instance stride enable is:
+//  true  - add product of the instanceID and advancement state to the 
offset into the VB
+//  false - value of vInstanceStride has been initialized to zero
+vOffsets = ADD(vOffsets, vInstanceStride);
+
 // Packing and component control 
 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
 const ComponentControl compCtrl[4] { 
(ComponentControl)ied.ComponentControl0, 
(ComponentControl)ied.ComponentControl1, 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/9] swr/rast: New GS state/context API

2017-09-21 Thread Tim Rowley
One piglit regression, which was a false pass:
  spec@glsl-1.50@execution@geometry@dynamic_input_array_index
---
 .../drivers/swr/rasterizer/core/frontend.cpp   | 227 -
 src/gallium/drivers/swr/rasterizer/core/state.h|  55 +++--
 src/gallium/drivers/swr/swr_shader.cpp | 183 -
 3 files changed, 253 insertions(+), 212 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f882869..26e76a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -710,45 +710,67 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* 
pStreamIdBase, uint32_t num
 
 THREAD SWR_GS_CONTEXT tlsGsContext;
 
-template
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
 {
-GsBufferInfo(const SWR_GS_STATE )
-{
-const uint32_t vertexCount = gsState.maxNumVerts;
-const uint32_t vertexStride = sizeof(SIMDVERTEX);
-const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / 
SIMD_WIDTH;
+uint8_t* pGsIn;
+uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+uint8_t* pGsTransposed;
+void* pStreamCutBuffer;
+};
 
-vertexPrimitiveStride = vertexStride * numSimdBatches;
-vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, 
fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, 
uint32_t numAttribs)
+{
+uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
 
-if (gsState.isSingleStream)
-{
-cutPrimitiveStride = (vertexCount + 7) / 8;
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
 
-streamCutPrimitiveStride = 0;
-streamCutInstanceStride = 0;
-}
-else
-{
-cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
-cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
-streamCutPrimitiveStride = (vertexCount + 7) / 8;
-streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
-}
+for (uint32_t i = 0; i < SimdWidth; ++i)
+{
+gatherOffsets[i] = srcVertexStride * i;
 }
+auto vGatherOffsets = SIMD_T::load_si((typename 
SIMD_T::Integer*)[0]);
 
-uint32_t vertexPrimitiveStride;
-uint32_t vertexInstanceStride;
+uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+uint32_t remainingVerts = numVerts;
 
-uint32_t cutPrimitiveStride;
-uint32_t cutInstanceStride;
+for (uint32_t s = 0; s < numSimd; ++s)
+{
+uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+uint8_t* pDstBase = pDst + s * dstVertexStride;
 
-uint32_t streamCutPrimitiveStride;
-uint32_t streamCutInstanceStride;
-};
+// Compute mask to prevent src overflow
+uint32_t mask = std::min(remainingVerts, SimdWidth);
+mask = GenMask(mask);
+auto vMask = SIMD_T::vmask_ps(mask);
+auto viMask = SIMD_T::castps_si(vMask);
+
+for (uint32_t a = 0; a < numAttribs; ++a)
+{
+auto attribGatherX = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)pSrcBase, 
vGatherOffsets, vMask);
+auto attribGatherY = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float)), vGatherOffsets, vMask);
+auto attribGatherZ = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 2), vGatherOffsets, vMask);
+auto attribGatherW = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + 
sizeof(float) * 3), vGatherOffsets, vMask);
+
+SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float)), viMask, attribGatherY);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float) * 2), viMask, attribGatherZ);
+SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename 
SIMD_T::Float) * 3), viMask, attribGatherW);
+
+pSrcBase += sizeof(float) * 4;
+pDstBase += sizeof(typename SIMD_T::Float) * 4;
+}
+remainingVerts -= SimdWidth;
+}
+}
 
 

[Mesa-dev] [PATCH 6/9] swr/rast: Slightly more efficient blend jit

2017-09-21 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp| 30 --
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index f2e6e53..3258639 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -581,13 +581,13 @@ struct BlendJit : public Builder
 // load src1
 src1[i] = LOAD(pSrc1, { i });
 }
-Value* currentMask = VIMMED1(-1);
+Value* currentSampleMask = VIMMED1(-1);
 if (state.desc.alphaToCoverageEnable)
 {
 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
 uint32_t bits = (1 << state.desc.numSamples) - 1;
-currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
-currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), 
mSimdInt32Ty);
+currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+currentSampleMask = FP_TO_SI(FADD(currentSampleMask, 
VIMMED1(0.5f)), mSimdInt32Ty);
 }
 
 // alpha test
@@ -766,34 +766,24 @@ struct BlendJit : public Builder
 assert(!(state.desc.alphaToCoverageEnable));
 // load current mask
 Value* oMask = LOAD(ppoMask);
-Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
-oMask = AND(oMask, sampleMasked);
-currentMask = AND(oMask, currentMask);
+currentSampleMask = AND(oMask, currentSampleMask);
 }
 
 if(state.desc.sampleMaskEnable)
 {
 Value* sampleMask = LOAD(pBlendState, { 0, 
SWR_BLEND_STATE_sampleMask});
-Value* sampleMasked = SHL(C(1), sampleNum);
-sampleMask = AND(sampleMask, sampleMasked);
-sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
-sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
-currentMask = AND(sampleMask, currentMask);
-}
-
-if (state.desc.alphaToCoverageEnable)
-{
-Value* sampleMasked = SHL(C(1), sampleNum);
-currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
 }
 
 if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
state.desc.oMaskEnable)
 {
-// load coverage mask
+// load coverage mask and mask off any lanes with no samples
 Value* pMask = LOAD(ppMask);
-currentMask = S_EXT(ICMP_UGT(currentMask, VBROADCAST(C(0))), 
mSimdInt32Ty);
-Value* outputMask = AND(pMask, currentMask);
+Value* sampleMasked = SHL(C(1), sampleNum);
+currentSampleMask = AND(currentSampleMask, 
VBROADCAST(sampleMasked));
+currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, 
VBROADCAST(C(0))), mSimdInt32Ty);
+Value* outputMask = AND(pMask, currentSampleMask);
 // store new mask
 STORE(outputMask, GEP(ppMask, C(0)));
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: remove llvm fence/atomics from generated files

2017-09-19 Thread Tim Rowley
We currently don't use these instructions, and since their API
changed in llvm-5.0 having them in the autogen files broke the mesa
release tarballs which ship with generated autogen files.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102847
CC: mesa-sta...@lists.freedesktop.org
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 025d38a..ce892a9 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -140,6 +140,14 @@ def parse_ir_builder(input_file):
 
 ignore = False
 
+# The following functions need to be ignored in openswr.
+# API change in llvm-5.0 breaks baked autogen files
+if (
+(func_name == 'CreateFence' or
+ func_name == 'CreateAtomicCmpXchg' or
+ func_name == 'CreateAtomicRMW')):
+ignore = True
+
 # The following functions need to be ignored.
 if (func_name == 'CreateInsertNUWNSWBinOp' or
 func_name == 'CreateMaskedIntrinsic' or
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/10] swr/rast: Missed conversion to SIMD_T

2017-09-11 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index a6713e8..e08e489 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -1195,7 +1195,7 @@ void BinPostSetupPointsImpl(
 }
 
 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
-_simd16_store_ps(reinterpret_cast(aPointSize), vPointSize);
+SIMD_T::store_ps(reinterpret_cast(aPointSize), vPointSize);
 
 uint32_t *pPrimID = (uint32_t *)
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/10] swr/rast: Add new API SwrStallBE

2017-09-11 Thread Tim Rowley
SwrStallBE stalls the backend threads until all work submitted before
the stall has finished.  The frontend threads can continue to make
forward progress.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 9 +
 src/gallium/drivers/swr/rasterizer/core/api.h   | 8 
 2 files changed, 17 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index ccb6dfb..6323098 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -458,6 +458,14 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, 
uint64_t userData, uint
 AR_API_END(APISync, 1);
 }
 
+void SwrStallBE(HANDLE hContext)
+{
+SWR_CONTEXT* pContext = GetContext(hContext);
+DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+pDC->dependent = true;
+}
+
 void SwrWaitForIdle(HANDLE hContext)
 {
 SWR_CONTEXT *pContext = GetContext(hContext);
@@ -1672,6 +1680,7 @@ void SwrGetInterface(SWR_INTERFACE _funcs)
 out_funcs.pfnSwrSaveState = SwrSaveState;
 out_funcs.pfnSwrRestoreState = SwrRestoreState;
 out_funcs.pfnSwrSync = SwrSync;
+out_funcs.pfnSwrStallBE = SwrStallBE;
 out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
 out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
 out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index a394205..577cfb1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -263,6 +263,13 @@ SWR_FUNC(void, SwrSync,
 uint64_t userData3);
 
 //
+/// @brief Stall cmd. Stalls the backend until all previous work has been 
completed.
+///Frontend work can continue to make progress
+/// @param hContext - Handle passed back from SwrCreateContext
+SWR_FUNC(void, SwrStallBE,
+HANDLE hContext);
+
+//
 /// @brief Blocks until all rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
 SWR_FUNC(void, SwrWaitForIdle,
@@ -709,6 +716,7 @@ struct SWR_INTERFACE
 PFNSwrSaveState pfnSwrSaveState;
 PFNSwrRestoreState pfnSwrRestoreState;
 PFNSwrSync pfnSwrSync;
+PFNSwrStallBE pfnSwrStallBE;
 PFNSwrWaitForIdle pfnSwrWaitForIdle;
 PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
 PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/10] swr/rast: whitespace changes

2017-09-11 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/jitter/jit_api.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h 
b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
index 9f69669..e589d2c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -51,6 +51,7 @@
 
 struct ShaderInfo;
 
+
 //
 /// Jit Compile Info Input
 //
@@ -63,6 +64,7 @@ struct JIT_COMPILE_INPUT
 size_t irLength;
 
 bool enableJitSampler;
+
 };
 
 extern "C"
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/10] swr/rast: Fetch compile state changes

2017-09-11 Thread Tim Rowley
Add InstanceStrideEnable field and rename InstanceDataStepRate to
InstanceAdvancementState in INPUT_ELEMENT_DESC structure.

Add stubs for handling InstanceStrideEnable in FetchJit::JitLoadVertices()
and FetchJit::JitGatherVertices() and assert if they are triggered.
---
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp | 12 ++--
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h   |  7 ---
 src/gallium/drivers/swr/swr_state.cpp   |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 761c58c..f3a4b27 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -360,7 +360,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 Value *startOffset;
 if(ied.InstanceEnable)
 {
-Value* stepRate = C(ied.InstanceDataStepRate);
+Value* stepRate = C(ied.InstanceAdvancementState);
 
 // prevent a div by 0 for 0 step rate
 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
@@ -376,6 +376,10 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE 
, Value* str
 
 startOffset = startInstance;
 }
+else if (ied.InstanceStrideEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
+}
 else
 {
 // offset indices by baseVertex
@@ -825,7 +829,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 Value *startOffset;
 if(ied.InstanceEnable)
 {
-Value* stepRate = C(ied.InstanceDataStepRate);
+Value* stepRate = C(ied.InstanceAdvancementState);
 
 // prevent a div by 0 for 0 step rate
 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
@@ -841,6 +845,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE 
,
 
 startOffset = startInstance;
 }
+else if (ied.InstanceStrideEnable)
+{
+SWR_ASSERT((0), "TODO: Fill out more once driver sends this 
down.");
+}
 else
 {
 // offset indices by baseVertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index 4f456af..0dd6de7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -45,16 +45,17 @@ struct INPUT_ELEMENT_DESC
 uint32_tFormat : 10;
 uint32_tStreamIndex : 6;
 uint32_tInstanceEnable : 1;
+uint32_tInstanceStrideEnable : 1;
 uint32_tComponentControl0 : 3;
 uint32_tComponentControl1 : 3;
 uint32_tComponentControl2 : 3;
 uint32_tComponentControl3 : 3;
 uint32_tComponentPacking : 4;
-uint32_t_reserved : 19;
+uint32_t_reserved : 18;
 };
 uint64_t bits;
 };
-uint32_t InstanceDataStepRate;
+uint32_t InstanceAdvancementState;
 };
 
 // used to set ComponentPacking
@@ -124,7 +125,7 @@ struct FETCH_COMPILE_STATE
 {
 if((layout[i].bits != other.layout[i].bits) ||
((layout[i].InstanceEnable == 1) &&
-(layout[i].InstanceDataStepRate != 
other.layout[i].InstanceDataStepRate))){
+(layout[i].InstanceAdvancementState != 
other.layout[i].InstanceAdvancementState))){
 return false;
 }
 }
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index 1491868..93108de 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -531,7 +531,7 @@ swr_create_vertex_elements_state(struct pipe_context *pipe,
 ? ComponentControl::StoreSrc
 : ComponentControl::Store1Fp;
  velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW;
- velems->fsState.layout[i].InstanceDataStepRate =
+ velems->fsState.layout[i].InstanceAdvancementState =
 attribs[i].instance_divisor;
 
  /* Calculate the pitch of each stream */
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 05/10] swr/rast: Migrate memory pointers to gfxptr_t type

2017-09-11 Thread Tim Rowley
---
 .../swr/rasterizer/codegen/gen_llvm_types.py|  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h |  5 +++--
 .../drivers/swr/rasterizer/memory/StoreTile.h   |  4 ++--
 .../drivers/swr/rasterizer/memory/TilingFunctions.h |  2 +-
 src/gallium/drivers/swr/swr_context.cpp | 18 +-
 src/gallium/drivers/swr/swr_draw.cpp|  8 
 src/gallium/drivers/swr/swr_resource.h  |  2 +-
 src/gallium/drivers/swr/swr_screen.cpp  | 21 ++---
 src/gallium/drivers/swr/swr_state.cpp   | 10 +-
 9 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
index 94f3f9f..ccf2bde 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py
@@ -42,7 +42,7 @@ def gen_llvm_type(type, name, is_pointer, is_pointer_pointer, 
is_array, is_array
 else:
 if type == 'BYTE' or type == 'char' or type == 'uint8_t' or type == 
'int8_t' or type == 'bool':
 llvm_type = 'Type::getInt8Ty(ctx)'
-elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type 
== 'int64_t':
+elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type 
== 'int64_t' or type == 'gfxptr_t':
 llvm_type = 'Type::getInt64Ty(ctx)'
 elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t':
 llvm_type = 'Type::getInt16Ty(ctx)'
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index b0af663..13c1d8b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -29,6 +29,7 @@
 
 #include "common/formats.h"
 #include "common/intrin.h"
+using gfxptr_t = unsigned long long;
 #include 
 #include 
 
@@ -513,7 +514,7 @@ enum SWR_AUX_MODE
 //
 struct SWR_SURFACE_STATE
 {
-uint8_t *pBaseAddress;
+gfxptr_t xpBaseAddress;
 SWR_SURFACE_TYPE type;  // @llvm_enum
 SWR_FORMAT format;  // @llvm_enum
 uint32_t width;
@@ -536,7 +537,7 @@ struct SWR_SURFACE_STATE
 
 uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces
 
-uint8_t *pAuxBaseAddress;   // Used for compression, append/consume 
counter, etc.
+gfxptr_t xpAuxBaseAddress;   // Used for compression, append/consume 
counter, etc.
 SWR_AUX_MODE auxMode;  // @llvm_enum
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h 
b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index c3d14e9..512c338 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -1179,7 +1179,7 @@ struct StoreRasterTile
 resolveColor[3] *= oneOverNumSamples;
 
 // Use the resolve surface state
-SWR_SURFACE_STATE* pResolveSurface = 
(SWR_SURFACE_STATE*)pDstSurface->pAuxBaseAddress;
+SWR_SURFACE_STATE* pResolveSurface = 
(SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
 uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress((x + rx), (y + ry),
 pResolveSurface->arrayIndex + renderTargetArrayIndex, 
pResolveSurface->arrayIndex + renderTargetArrayIndex,
 0, pResolveSurface->lod, pResolveSurface);
@@ -2390,7 +2390,7 @@ struct StoreMacroTile
 }
 }
 
-if (pDstSurface->pAuxBaseAddress)
+if (pDstSurface->xpAuxBaseAddress)
 {
 uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * 
(FormatTraits::bpp / 8);
 // Store each raster tile from the hot tile to the destination 
surface.
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h 
b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
index 9222d3e..6c801c7 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -694,5 +694,5 @@ template
 INLINE
 void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t 
array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
 {
-return pState->pBaseAddress + ComputeSurfaceOffset(x, y, 
z, array, sampleNum, lod, pState);
+return (void*)(pState->xpBaseAddress + 
ComputeSurfaceOffset(x, y, z, array, sampleNum, lod, pState));
 }
diff --git a/src/gallium/drivers/swr/swr_context.cpp 
b/src/gallium/drivers/swr/swr_context.cpp
index c058870..e95bd3b 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -152,12 +152,12 @@ swr_transfer_map(struct pipe_context *pipe,
  for (int y = box->y; y < box->y + 

[Mesa-dev] [PATCH 04/10] swr/rast: Remove hardcoded clip/cull slot from clipper

2017-09-11 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/clip.h | 35 +++---
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index e0aaf81..cde5261 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -372,13 +372,15 @@ public:
 int ComputeUserClipCullMask(PA_STATE , typename SIMD_T::Vec4 prim[])
 {
 uint8_t cullMask = state.backendState.cullDistanceMask;
+uint32_t vertexClipCullOffset = 
state.backendState.vertexClipCullOffset;
+
 typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
 
 typename SIMD_T::Vec4 vClipCullDistLo[3];
 typename SIMD_T::Vec4 vClipCullDistHi[3];
 
-pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
-pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
+pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
+pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 
 DWORD index;
 while (_BitScanForward(, cullMask))
@@ -488,21 +490,22 @@ public:
 }
 
 // assemble user clip distances if enabled
+uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
 if (state.backendState.clipDistanceMask & 0xf)
 {
-pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+pa.Assemble(vertexClipCullSlot, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = 
tmpVector[i];
+vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
 }
 }
 
 if (state.backendState.clipDistanceMask & 0xf0)
 {
-pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+pa.Assemble(vertexClipCullSlot + 1, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
-vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = 
tmpVector[i];
+vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
 }
 }
 
@@ -613,26 +616,27 @@ public:
 }
 
 // transpose user clip distances if enabled
+uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
 if (state.backendState.clipDistanceMask & 0x0f)
 {
-pBase = reinterpret_cast([0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * 
inputPrim;
+pBase = reinterpret_cast([0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
 
 for (uint32_t c = 0; c < 4; ++c)
 {
 SIMD256::Float temp = SIMD256::template 
mask_i32gather_ps(SIMD256::setzero_ps(), 
reinterpret_cast(pBase), vOffsets, vMask);
-transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] 
= SimdHelper::insert_lo_ps(temp);
+transposedPrims[0].attrib[vertexClipCullSlot][c] = 
SimdHelper::insert_lo_ps(temp);
 pBase += sizeof(typename SIMD_T::Float);
 }
 }
 
 if (state.backendState.clipDistanceMask & 0xf0)
 {
-pBase = reinterpret_cast([0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * 
inputPrim;
+pBase = reinterpret_cast([0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
 
 for (uint32_t c = 0; c < 4; ++c)
 {
 SIMD256::Float temp = SIMD256::template 
mask_i32gather_ps(SIMD256::setzero_ps(), 
reinterpret_cast(pBase), vOffsets, vMask);
-transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] 
= SimdHelper::insert_lo_ps(temp);
+transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = 
SimdHelper::insert_lo_ps(temp);
 pBase += sizeof(typename SIMD_T::Float);
 }
 }
@@ -692,6 +696,7 @@ public:
 
 // OOB indices => forced to zero.
 typename SIMD_T::Integer vpai = 
SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 typename SIMD_T::Integer vNumViewports = 
SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, 
vNumViewports);
 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -822,6 +827,7 @@ private:
 float *pOutVerts)   // array of output 
positions. We'll write our new intersection point at i*4.
 {
 uint32_t vertexAttribOffset = 
this->state.backendState.vertexAttribOffset;
+uint32_t vertexClipCullOffset = 
this->state.backendState.vertexClipCullOffset;
 
 // compute interpolation factor
 typename SIMD_T::Float 

[Mesa-dev] [PATCH 06/10] swr/rast: add graph write to jit debug output

2017-09-11 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index fc32b62..e4281f8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -296,10 +296,10 @@ void JitManager::DumpToFile(Function *f, const char 
*fileName)
 #endif
 fd.flush();
 
-//raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
-//WriteGraph(fd_cfg, (const Function*)f);
+raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text);
+WriteGraph(fd_cfg, (const Function*)f);
 
-//fd_cfg.flush();
+fd_cfg.flush();
 }
 }
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/10] swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot

2017-09-11 Thread Tim Rowley
Add a new field, SWR_BACKEND_STATE::vertexClipCullOffset, to specify the
start of the clip/cull section of the vertex header.  Remove use of the
hardcoded slot from the binner.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 11 ++-
 src/gallium/drivers/swr/rasterizer/core/state.h|  9 ++---
 src/gallium/drivers/swr/swr_state.cpp  |  3 +++
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 19afd1f..a6713e8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -366,16 +366,17 @@ PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t 
NumVerts, bool IsSwizzl
 /// @param clipDistMask - mask of enabled clip distances
 /// @param pUserClipBuffer - buffer to store results
 template
-void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t 
clipDistMask, float *pRecipW, float* pUserClipBuffer)
+void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, 
uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 {
 DWORD clipDist;
+uint32_t clipDistMask = state.clipDistanceMask;
 while (_BitScanForward(, clipDistMask))
 {
 clipDistMask &= ~(1 << clipDist);
 uint32_t clipSlot = clipDist >> 2;
 uint32_t clipComp = clipDist & 0x3;
 uint32_t clipAttribSlot = clipSlot == 0 ?
-VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
+state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 
 simd4scalar primClipDist[3];
 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
@@ -872,7 +873,7 @@ endBinTriangles:
 {
 uint32_t numClipDist = 
_mm_popcnt_u32(state.backendState.clipDistanceMask);
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * 
sizeof(float));
-ProcessUserClipDist<3>(pa, triIndex, 
state.backendState.clipDistanceMask, [12], 
desc.pUserClipBuffer);
+ProcessUserClipDist<3>(state.backendState, pa, triIndex, 
[12], desc.pUserClipBuffer);
 }
 
 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
@@ -1248,7 +1249,7 @@ void BinPostSetupPointsImpl(
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * 
sizeof(float));
 float dists[8];
 float one = 1.0f;
-ProcessUserClipDist<1>(pa, primIndex, 
backendState.clipDistanceMask, , dists);
+ProcessUserClipDist<1>(backendState, pa, primIndex, , 
dists);
 for (uint32_t i = 0; i < numClipDist; i++) {
 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
@@ -1577,7 +1578,7 @@ void BinPostSetupLinesImpl(
 {
 uint32_t numClipDist = 
_mm_popcnt_u32(state.backendState.clipDistanceMask);
 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * 
sizeof(float));
-ProcessUserClipDist<2>(pa, primIndex, 
state.backendState.clipDistanceMask, [12], 
desc.pUserClipBuffer);
+ProcessUserClipDist<2>(state.backendState, pa, primIndex, 
[12], desc.pUserClipBuffer);
 }
 
 MacroTileMgr *pTileMgr = pDC->pTileMgr;
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h 
b/src/gallium/drivers/swr/rasterizer/core/state.h
index 284c523..b0af663 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -1070,12 +1070,15 @@ struct SWR_BACKEND_STATE
 bool readRenderTargetArrayIndex;// Forward render target array index 
from last FE stage to the backend
 bool readViewportArrayIndex;// Read viewport array index from last 
FE stage during binning
 
-// user clip/cull distance enables
+   // Offset to the start of the attributes of the input vertices, in 
simdvector units
+uint32_t vertexAttribOffset;
+
+// User clip/cull distance enables
 uint8_t cullDistanceMask;
 uint8_t clipDistanceMask;
 
-   // Offset to the start of the attributes of the input vertices, in 
simdvector units
-uint32_t vertexAttribOffset;
+// Offset to clip/cull attrib section of the vertex, in simdvector units
+uint32_t vertexClipCullOffset;
 };
 
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp 
b/src/gallium/drivers/swr/swr_state.cpp
index d5b553b..69a4473 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -1766,6 +1766,9 @@ swr_update_derived(struct pipe_context *pipe,
backendState.cullDistanceMask =
   ctx->vs->info.base.culldist_writemask << 
ctx->vs->info.base.num_written_clipdistance;
 
+   // Assume old layout of SGV, POSITION, CLIPCULL, ATTRIB
+   backendState.vertexClipCullOffset = backendState.vertexAttribOffset - 2;
+

[Mesa-dev] [PATCH 09/10] swr/rast: adjust linux cpu topology identification code

2017-09-11 Thread Tim Rowley
Make more robust to handle strange configurations like a vmware
exported 4-way numa X 1-core configuration.
---
 .../drivers/swr/rasterizer/core/threads.cpp| 81 ++
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index b704d23..4bb395d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -169,37 +169,16 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, 
uint32_t& out_numThread
 std::ifstream input("/proc/cpuinfo");
 std::string line;
 char* c;
-uint32_t threadId = uint32_t(-1);
+uint32_t procId = uint32_t(-1);
 uint32_t coreId = uint32_t(-1);
-uint32_t numaId = uint32_t(-1);
+uint32_t physId = uint32_t(-1);
 
 while (std::getline(input, line))
 {
 if (line.find("processor") != std::string::npos)
 {
-if (threadId != uint32_t(-1))
-{
-// Save information.
-if (out_nodes.size() <= numaId)
-{
-out_nodes.resize(numaId + 1);
-}
-
-auto& numaNode = out_nodes[numaId];
-if (numaNode.cores.size() <= coreId)
-{
-numaNode.cores.resize(coreId + 1);
-}
-
-auto& core = numaNode.cores[coreId];
-core.procGroup = coreId;
-core.threadIds.push_back(threadId);
-
-out_numThreadsPerProcGroup++;
-}
-
 auto data_start = line.find(": ") + 2;
-threadId = std::strtoul(_str()[data_start], , 10);
+procId = std::strtoul(_str()[data_start], , 10);
 continue;
 }
 if (line.find("core id") != std::string::npos)
@@ -211,29 +190,32 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, 
uint32_t& out_numThread
 if (line.find("physical id") != std::string::npos)
 {
 auto data_start = line.find(": ") + 2;
-numaId = std::strtoul(_str()[data_start], , 10);
+physId = std::strtoul(_str()[data_start], , 10);
 continue;
 }
+if (line.length() == 0)
+{
+if (physId + 1 > out_nodes.size())
+out_nodes.resize(physId + 1);
+auto& numaNode = out_nodes[physId];
+numaNode.numaId = physId;
+
+if (coreId + 1 > numaNode.cores.size())
+numaNode.cores.resize(coreId + 1);
+auto& core = numaNode.cores[coreId];
+core.procGroup = coreId;
+core.threadIds.push_back(procId);
+}
 }
 
-if (threadId != uint32_t(-1))
+out_numThreadsPerProcGroup = 0;
+for (auto  : out_nodes)
 {
-// Save information.
-if (out_nodes.size() <= numaId)
+for (auto  : node.cores)
 {
-out_nodes.resize(numaId + 1);
+out_numThreadsPerProcGroup = 
std::max((size_t)out_numThreadsPerProcGroup,
+  core.threadIds.size());
 }
-auto& numaNode = out_nodes[numaId];
-numaNode.numaId = numaId;
-if (numaNode.cores.size() <= coreId)
-{
-numaNode.cores.resize(coreId + 1);
-}
-auto& core = numaNode.cores[coreId];
-
-core.procGroup = coreId;
-core.threadIds.push_back(threadId);
-out_numThreadsPerProcGroup++;
 }
 
 #else
@@ -316,7 +298,11 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, 
uint32_t procGroupId =
 CPU_ZERO();
 CPU_SET(threadId, );
 
-pthread_setaffinity_np(thread, sizeof(cpu_set_t), );
+int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), );
+if (err != 0)
+{
+fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", 
threadId, strerror(err));
+}
 
 #endif
 }
@@ -1031,7 +1017,16 @@ void CreateThreadPool(SWR_CONTEXT* pContext, 
THREAD_POOL* pPool)
 }
 else
 {
-pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 
2, 4, etc.)
+// numa distribution assumes workers on all nodes
+bool useNuma = true;
+if (numCoresPerNode * numHyperThreads == 1)
+useNuma = false;
+
+if (useNuma) {
+pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes 
(1, 2, 4, etc.)
+} else {
+pPool->numaMask = 0;
+}
 
 uint32_t workerId = 0;
 for (uint32_t n = 0; n < numNodes; ++n)
@@ -1064,7 +1059,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* 
pPool)
 pPool->pThreadData[workerId].workerId = workerId;
 pPool->pThreadData[workerId].procGroupId = core.procGroup;
 pPool->pThreadData[workerId].threadId = 

[Mesa-dev] [PATCH 02/10] swr/rast: Move clip/cull enables in API

2017-09-11 Thread Tim Rowley
Moved from SWR_RASTSTATE to SWR_BACKEND_STATE.
---
 .../drivers/swr/rasterizer/core/backend.cpp|  4 ++--
 .../drivers/swr/rasterizer/core/backend_impl.h |  2 +-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  4 ++--
 .../swr/rasterizer/core/backend_singlesample.cpp   |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 18 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h | 22 +++---
 .../drivers/swr/rasterizer/core/rasterizer.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|  8 
 src/gallium/drivers/swr/swr_state.cpp  | 16 
 9 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 363349f..6282e87 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -272,9 +272,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, 
uint32_t x, uint32_t y,
 AR_END(BEBarycentric, 0);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
+coverageMask &= 
~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
 }
 
 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h 
b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 0f430ef..593082b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -886,7 +886,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, 
uint32_t x, uint32_t
 
 AR_END(BESetup, 0);
 
-PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, 
pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, 
pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
 
 psContext.vY.UL = _simd_add_ps(vULOffsetsY, 
_simd_set1_ps(static_cast(y)));
 psContext.vY.center = _simd_add_ps(vCenterOffsetsY, 
_simd_set1_ps(static_cast(y)));
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index bb2e9a9..04e34aa 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -128,9 +128,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint32_
 AR_END(BEBarycentric, 0);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
+coverageMask &= 
~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.sample, psContext.vJ.sample);
 }
 
 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp 
b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index 18f4299..686b979 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -112,9 +112,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t 
workerId, uint32_t x, uint3
 AR_END(BEBarycentric, 1);
 
 // interpolate user clip distance if available
-if (state.rastState.clipDistanceMask)
+if (state.backendState.clipDistanceMask)
 {
-coverageMask &= 
~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.center, psContext.vJ.center);
+coverageMask &= 
~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, 
psContext.vI.center, psContext.vJ.center);
 }
 
 simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 01c2f8f..19afd1f 100644
--- 

[Mesa-dev] [PATCH 00/10] swr: update rasterizer

2017-09-11 Thread Tim Rowley
Mostly some api changes, plus making the cpu topology code a bit more
robust in the face of some odd configurations seen in virtualized
environments.

No piglit or vtk ctest regressions.

Tim Rowley (10):
  swr/rast: Add new API SwrStallBE
  swr/rast: Move clip/cull enables in API
  swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot
  swr/rast: Remove hardcoded clip/cull slot from clipper
  swr/rast: Migrate memory pointers to gfxptr_t type
  swr/rast: add graph write to jit debug output
  swr/rast: whitespace changes
  swr/rast: Missed conversion to SIMD_T
  swr/rast: adjust linux cpu topology identification code
  swr/rast: Fetch compile state changes

 .../swr/rasterizer/codegen/gen_llvm_types.py   |  2 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  9 +++
 src/gallium/drivers/swr/rasterizer/core/api.h  |  8 +++
 .../drivers/swr/rasterizer/core/backend.cpp|  4 +-
 .../drivers/swr/rasterizer/core/backend_impl.h |  2 +-
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  4 +-
 .../swr/rasterizer/core/backend_singlesample.cpp   |  4 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 25 +++
 src/gallium/drivers/swr/rasterizer/core/clip.h | 57 ---
 .../drivers/swr/rasterizer/core/rasterizer.cpp |  2 +-
 src/gallium/drivers/swr/rasterizer/core/state.h| 16 +++--
 .../drivers/swr/rasterizer/core/threads.cpp| 81 ++
 .../drivers/swr/rasterizer/jitter/JitManager.cpp   |  6 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp| 12 +++-
 .../drivers/swr/rasterizer/jitter/fetch_jit.h  |  7 +-
 .../drivers/swr/rasterizer/jitter/jit_api.h|  2 +
 .../drivers/swr/rasterizer/memory/StoreTile.h  |  4 +-
 .../swr/rasterizer/memory/TilingFunctions.h|  2 +-
 src/gallium/drivers/swr/swr_context.cpp| 18 ++---
 src/gallium/drivers/swr/swr_draw.cpp   |  8 +--
 src/gallium/drivers/swr/swr_resource.h |  2 +-
 src/gallium/drivers/swr/swr_screen.cpp | 21 +++---
 src/gallium/drivers/swr/swr_state.cpp  | 31 +
 23 files changed, 182 insertions(+), 145 deletions(-)

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 6/8] swr/rast: SIMD16 FE remove templated immediates workaround

2017-09-05 Thread Tim Rowley
Fixed properly in gcc-compatible fashion.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 110 -
 1 file changed, 20 insertions(+), 90 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index e09ff7a..832c47d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -404,35 +404,6 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, 
uint8_t clipDistMask,
 }
 }
 
-// WA linux compiler issue with SIMDLIB and shift immediates
-#define SIMD_WA_SXXI_EPI32 1
-
-#if SIMD_WA_SXXI_EPI32
-template
-simdscalari simd_wa_slli_epi32(simdscalari a)
-{
-return SIMD256::slli_epi32(a);
-}
-
-template
-simd16scalari simd_wa_slli_epi32(simd16scalari a)
-{
-return SIMD512::slli_epi32(a);
-}
-
-template
-simdscalari simd_wa_srai_epi32(simdscalari a)
-{
-return SIMD256::srai_epi32(a);
-}
-
-template
-simd16scalari simd_wa_srai_epi32(simd16scalari a)
-{
-return SIMD512::srai_epi32(a);
-}
-
-#endif
 INLINE
 void TransposeVertices(simd4scalar()[8], const simdscalar , const 
simdscalar , const simdscalar )
 {
@@ -804,17 +775,10 @@ endBinTriangles:
 }
 
 // Convert triangle bbox to macrotile units.
-#if SIMD_WA_SXXI_EPI32
-bbox.xmin = 
simd_wa_srai_epi32(bbox.xmin);
-bbox.ymin = 
simd_wa_srai_epi32(bbox.ymin);
-bbox.xmax = 
simd_wa_srai_epi32(bbox.xmax);
-bbox.ymax = 
simd_wa_srai_epi32(bbox.ymax);
-#else
-bbox.xmin = 
SIMD_T::srai_epi32(bbox.xmin);
-bbox.ymin = 
SIMD_T::srai_epi32(bbox.ymin);
-bbox.xmax = 
SIMD_T::srai_epi32(bbox.xmax);
-bbox.ymax = 
SIMD_T::srai_epi32(bbox.ymax);
-#endif
+bbox.xmin = SIMD_T::template 
srai_epi32(bbox.xmin);
+bbox.ymin = SIMD_T::template 
srai_epi32(bbox.ymin);
+bbox.xmax = SIMD_T::template 
srai_epi32(bbox.xmax);
+bbox.ymax = SIMD_T::template 
srai_epi32(bbox.ymax);
 
 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], 
aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 
@@ -1034,13 +998,8 @@ void BinPostSetupPointsImpl(
 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
 
 // compute macro tile coordinates 
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer macroX = 
simd_wa_srai_epi32(vXi);
-typename SIMD_T::Integer macroY = 
simd_wa_srai_epi32(vYi);
-#else
-typename SIMD_T::Integer macroX = 
SIMD_T::srai_epi32(vXi);
-typename SIMD_T::Integer macroY = 
SIMD_T::srai_epi32(vYi);
-#endif
+typename SIMD_T::Integer macroX = SIMD_T::template 
srai_epi32(vXi);
+typename SIMD_T::Integer macroY = SIMD_T::template 
srai_epi32(vYi);
 
 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
 
@@ -1048,30 +1007,15 @@ void BinPostSetupPointsImpl(
 SIMD_T::store_si(reinterpret_cast(aMacroY), macroY);
 
 // compute raster tile coordinates
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer rasterX = 
simd_wa_srai_epi32(vXi);
-typename SIMD_T::Integer rasterY = 
simd_wa_srai_epi32(vYi);
-#else
-typename SIMD_T::Integer rasterX = 
SIMD_T::srai_epi32(vXi);
-typename SIMD_T::Integer rasterY = 
SIMD_T::srai_epi32(vYi);
-#endif
+typename SIMD_T::Integer rasterX = SIMD_T::template 
srai_epi32(vXi);
+typename SIMD_T::Integer rasterY = SIMD_T::template 
srai_epi32(vYi);
 
 // compute raster tile relative x,y for coverage mask
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer tileAlignedX = 
simd_wa_slli_epi32(rasterX);
-typename SIMD_T::Integer tileAlignedY = 
simd_wa_slli_epi32(rasterY);
-#else
-typename SIMD_T::Integer tileAlignedX = 
SIMD_T::slli_epi32(rasterX);
-typename SIMD_T::Integer tileAlignedY = 
SIMD_T::slli_epi32(rasterY);
-#endif
+typename SIMD_T::Integer tileAlignedX = SIMD_T::template 
slli_epi32(rasterX);
+typename SIMD_T::Integer tileAlignedY = SIMD_T::template 
slli_epi32(rasterY);
 
-#if SIMD_WA_SXXI_EPI32
-typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(simd_wa_srai_epi32(vXi), tileAlignedX);
-typename SIMD_T::Integer tileRelativeY = 
SIMD_T::sub_epi32(simd_wa_srai_epi32(vYi), tileAlignedY);
-#else
-typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(SIMD_T::srai_epi32(vXi), tileAlignedX);
-typename SIMD_T::Integer tileRelativeY = 
SIMD_T::sub_epi32(SIMD_T::srai_epi32(vYi), tileAlignedY);
-#endif
+typename SIMD_T::Integer tileRelativeX = 
SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vXi), 
tileAlignedX);
+typename SIMD_T::Integer tileRelativeY = 
SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vYi), 
tileAlignedY);
 
 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
@@ -1223,17 +1167,10 @@ void BinPostSetupPointsImpl(
 primMask = primMask & ~maskOutsideScissor;
 
 // Convert 

[Mesa-dev] [PATCH 7/8] swr/rast: Remove use of C++14 template variable

2017-09-05 Thread Tim Rowley
SWR rasterizer must remain C++11 compliant.
---
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  6 +++---
 src/gallium/drivers/swr/rasterizer/core/binner.h   | 14 +++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 832c47d..01c2f8f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -502,7 +502,7 @@ void SIMDCALL BinTrianglesImpl(
 }
 
 // Adjust for pixel center location
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
@@ -1332,7 +1332,7 @@ void BinPointsImpl(
 }
 }
 
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
@@ -1666,7 +1666,7 @@ void SIMDCALL BinLinesImpl(
 }
 
 // adjust for pixel center location
-typename SIMD_T::Float offset = 
g_pixelOffsets[rastState.pixelLocation];
+typename SIMD_T::Float offset = 
SwrPixelOffsets::GetOffset(rastState.pixelLocation);
 
 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h 
b/src/gallium/drivers/swr/rasterizer/core/binner.h
index e842aa6..97e113f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -31,11 +31,19 @@
 //
 /// @brief Offsets added to post-viewport vertex positions based on
 /// raster state.
+///
+/// Can't use templated variable because we must stick with C++11 features.
+/// Template variables were introduced with C++14
 template 
-static const typename SIMD_T::Float g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+struct SwrPixelOffsets
 {
-SIMD_T::set1_ps(0.0f),  // SWR_PIXEL_LOCATION_CENTER
-SIMD_T::set1_ps(0.5f),  // SWR_PIXEL_LOCATION_UL
+public:
+INLINE static typename SIMD_T::Float GetOffset(uint32_t loc)
+{
+SWR_ASSERT(loc <= 1);
+
+return SIMD_T::set1_ps(loc ? 0.5f : 0.0f);
+}
 };
 
 //
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 0/8] swr: update rasterizer

2017-09-05 Thread Tim Rowley
Highlight is starting to unify the simd/simd16 code, removing lots of
temporary code duplication.

No piglit or vtk test regressions.

Tim Rowley (8):
  swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets
  swr: set caps for VB 4-byte alignment
  swr/rast: Removed some trailing whitespace caught during review
  swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types
  swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble
  swr/rast: SIMD16 FE remove templated immediates workaround
  swr/rast: Remove use of C++14 template variable
  swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types

 .../swr/rasterizer/codegen/gen_llvm_ir_macros.py   |1 +
 .../codegen/templates/gen_ar_eventhandlerfile.hpp  |4 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 2312 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.h   |  192 +-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |   16 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h | 1654 --
 .../drivers/swr/rasterizer/core/conservativeRast.h |1 +
 src/gallium/drivers/swr/rasterizer/core/fifo.hpp   |4 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |6 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h   |   20 +-
 src/gallium/drivers/swr/rasterizer/core/state.h|7 +
 src/gallium/drivers/swr/rasterizer/core/utils.h|8 +
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp|7 +-
 src/gallium/drivers/swr/swr_screen.cpp |9 +-
 14 files changed, 1193 insertions(+), 3048 deletions(-)

-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 5/8] swr/rast: SIMD16 PA - rename Assemble_simd16 to Assemble

2017-09-05 Thread Tim Rowley
For consistency and to support overloading.
---
 src/gallium/drivers/swr/rasterizer/core/clip.h | 18 +-
 .../drivers/swr/rasterizer/core/frontend.cpp   |  6 +++---
 src/gallium/drivers/swr/rasterizer/core/pa.h   | 22 +++---
 3 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ffc69c4..5238284 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -399,8 +399,8 @@ public:
 simd16vector vClipCullDistLo[3];
 simd16vector vClipCullDistHi[3];
 
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
+pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
+pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
 
 DWORD index;
 while (_BitScanForward(, cullMask))
@@ -680,7 +680,7 @@ public:
 {
 #if USE_SIMD16_FRONTEND
 simd16vector attrib_simd16[NumVertsPerPrim];
-bool assemble = 
clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, 
attrib_simd16);
 
 if (assemble)
 {
@@ -731,7 +731,7 @@ public:
 
 // assemble pos
 simd16vector tmpVector[NumVertsPerPrim];
-pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector);
+pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
@@ -748,7 +748,7 @@ public:
 maxSlot = std::max(maxSlot, mapSlot);
 uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 
-pa.Assemble_simd16(inputSlot, tmpVector);
+pa.Assemble(inputSlot, tmpVector);
 
 // if constant interpolation enabled for this attribute, assign 
the provoking
 // vertex values to all edges
@@ -771,7 +771,7 @@ public:
 // assemble user clip distances if enabled
 if (this->state.rastState.clipDistanceMask & 0xf)
 {
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = 
tmpVector[i];
@@ -780,7 +780,7 @@ public:
 
 if (this->state.rastState.clipDistanceMask & 0xf0)
 {
-pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
 for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
 {
 vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = 
tmpVector[i];
@@ -919,7 +919,7 @@ public:
 do
 {
 simd16vector attrib[NumVertsPerPrim];
-bool assemble = 
clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib);
+bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, 
attrib);
 
 if (assemble)
 {
@@ -1060,7 +1060,7 @@ public:
 if (state.backendState.readViewportArrayIndex)
 {
 simd16vector vpiAttrib[NumVertsPerPrim];
-pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
 // OOB indices => forced to zero.
 simd16scalari vpai = 
_simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 406a0e0..f882869 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -929,7 +929,7 @@ static void GeometryShaderStage(
 #if USE_SIMD16_FRONTEND
 simd16vector attrib_simd16[3];
 
-bool assemble = 
gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, 
attrib_simd16);
 
 #else
 bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, 
attrib);
@@ -1297,7 +1297,7 @@ static void TessellationStages(
 AR_BEGIN(FEPAAssemble, pDC->drawId);
 bool assemble =
 #if USE_SIMD16_FRONTEND
-tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, 
prim_simd16);
+tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
 #else
 tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
 #endif
@@ -1646,7 +1646,7 @@ void ProcessDraw(
 simd16vector 

[Mesa-dev] [PATCH 8/8] swr/rast: FE/Clipper - unify SIMD8/16 functions using simdlib types

2017-09-05 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/core/clip.cpp |   16 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h   | 1650 ++
 src/gallium/drivers/swr/rasterizer/core/state.h  |7 +
 3 files changed, 465 insertions(+), 1208 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp 
b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 4b5512c..a40f077 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -32,9 +32,9 @@
 #include "core/clip.h"
 
 // Temp storage used by the clipper
-THREAD simdvertex tlsTempVertices[7];
+THREAD SIMDVERTEX_T tlsTempVertices[7];
 #if USE_SIMD16_FRONTEND
-THREAD simd16vertex tlsTempVertices_simd16[7];
+THREAD SIMDVERTEX_T tlsTempVertices_simd16[7];
 #endif
 
 float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
@@ -164,7 +164,7 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, 
uint32_t workerId, simdvecto
 {
 SWR_CONTEXT *pContext = pDC->pContext;
 AR_BEGIN(FEClipTriangles, pDC->drawId);
-Clipper<3> clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 clipper.ExecuteStage(pa, prims, primMask, primId);
 AR_END(FEClipTriangles, 1);
 }
@@ -173,7 +173,7 @@ void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t 
workerId, simdvector pr
 {
 SWR_CONTEXT *pContext = pDC->pContext;
 AR_BEGIN(FEClipLines, pDC->drawId);
-Clipper<2> clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 clipper.ExecuteStage(pa, prims, primMask, primId);
 AR_END(FEClipLines, 1);
 }
@@ -182,7 +182,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t 
workerId, simdvector p
 {
 SWR_CONTEXT *pContext = pDC->pContext;
 AR_BEGIN(FEClipPoints, pDC->drawId);
-Clipper<1> clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 clipper.ExecuteStage(pa, prims, primMask, primId);
 AR_END(FEClipPoints, 1);
 }
@@ -195,7 +195,7 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, 
PA_STATE& pa, uint32_t wor
 
 enum { VERTS_PER_PRIM = 3 };
 
-Clipper clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 
 pa.useAlternateOffset = false;
 clipper.ExecuteStage(pa, prims, primMask, primId);
@@ -210,7 +210,7 @@ void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& 
pa, uint32_t workerI
 
 enum { VERTS_PER_PRIM = 2 };
 
-Clipper clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 
 pa.useAlternateOffset = false;
 clipper.ExecuteStage(pa, prims, primMask, primId);
@@ -225,7 +225,7 @@ void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, 
PA_STATE& pa, uint32_t worker
 
 enum { VERTS_PER_PRIM = 1 };
 
-Clipper clipper(workerId, pDC);
+Clipper clipper(workerId, pDC);
 
 pa.useAlternateOffset = false;
 clipper.ExecuteStage(pa, prims, primMask, primId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 5238284..d7b559b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -33,9 +33,9 @@
 #include "rdtsc_core.h"
 
 // Temp storage used by the clipper
-extern THREAD simdvertex tlsTempVertices[7];
+extern THREAD SIMDVERTEX_T tlsTempVertices[7];
 #if USE_SIMD16_FRONTEND
-extern THREAD simd16vertex tlsTempVertices_simd16[7];
+extern THREAD SIMDVERTEX_T tlsTempVertices_simd16[7];
 #endif
 
 enum SWR_CLIPCODES
@@ -61,29 +61,29 @@ enum SWR_CLIPCODES
 
 #define GUARDBAND_CLIP_MASK 
(FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
 
-INLINE
-void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, 
simdscalar& clipCodes, simdscalari const )
+template
+void ComputeClipCodes(const API_STATE , const typename SIMD_T::Vec4 
, typename SIMD_T::Float , typename SIMD_T::Integer const 
)
 {
-clipCodes = _simd_setzero_ps();
+clipCodes = SIMD_T::setzero_ps();
 
 // -w
-simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
+typename SIMD_T::Float vNegW = 
SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
 
 // FRUSTUM_LEFT
-simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
-clipCodes = _simd_and_ps(vRes, 
_simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
+typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
+clipCodes = SIMD_T::and_ps(vRes, 
SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
 
 // FRUSTUM_TOP
-vRes = _simd_cmplt_ps(vertex.y, vNegW);
-clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, 
_simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP;
+vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
+clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, 
SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP;
 
 // FRUSTUM_RIGHT
-vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
- 

[Mesa-dev] [PATCH 2/8] swr: set caps for VB 4-byte alignment

2017-09-05 Thread Tim Rowley
Needed to compensate for change to fetch jit requiring
alignment.

Fixes regressions in piglit: vertex-buffer-offsets and about
another hundred of the vs-input*byte* tests.
---
 src/gallium/drivers/swr/swr_screen.cpp | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_screen.cpp 
b/src/gallium/drivers/swr/swr_screen.cpp
index cc8d995..85bf765 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -263,6 +263,12 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_FAKE_SW_MSAA:
   return (swr_screen(screen)->msaa_max_count > 1) ? 0 : 1;
 
+   /* fetch jit change for 2-4GB buffers requires alignment */
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+  return 1;
+
   /* unsupported features */
case PIPE_CAP_ANISOTROPIC_FILTER:
case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
@@ -274,9 +280,6 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap 
param)
case PIPE_CAP_COMPUTE:
case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
-   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
-   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
case PIPE_CAP_TGSI_TEXCOORD:
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/8] swr/rast: Allow gather of floats from fetch shader with 2-4GB offsets

2017-09-05 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 +
 src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp  | 7 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 2ed2b2f..025d38a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -45,6 +45,7 @@ intrinsics = [
 ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
 ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 
'mask', 'scale']],
+['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
 ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
 ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
 ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index dcfe897..761c58c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1005,7 +1005,12 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE ,
 Value *vMask = vGatherMask;
 
 // Gather a SIMD of vertices
-vVertexElements[currentVertexElement++] = 
GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+// APIs allow a 4GB range for offsets
+// However, GATHERPS uses signed 32-bit 
offsets, so only a 2GB range :(
+// But, we know that elements must be aligned 
for FETCH. :)
+// Right shift the offset by a bit and then 
scale by 2 to remove the sign extension.
+Value* vShiftedOffsets = VPSRLI(vOffsets, 
C(1));
+vVertexElements[currentVertexElement++] = 
GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
 }
 else
 {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/8] swr/rast: Removed some trailing whitespace caught during review

2017-09-05 Thread Tim Rowley
---
 .../rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/fifo.hpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/pa.h | 12 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index 0ca9a78..d1852b3 100644
--- 
a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -23,7 +23,7 @@
 * @file ${filename}
 *
 * @brief Event handler interface.  auto-generated file
-* 
+*
 * DO NOT EDIT
 *
 * Generation Command Line:
@@ -57,7 +57,7 @@ namespace ArchRast
 std::stringstream outDir;
 outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << 
std::ends;
 CreateDirectory(outDir.str().c_str(), NULL);
-
+
 // There could be multiple threads creating thread pools. We
 // want to make sure they are uniquly identified by adding in
 // the creator's thread id into the filename.
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp 
b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 3be72f3..43d3a83 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -79,7 +79,7 @@ struct QUEUE
 long initial = InterlockedCompareExchange(&mLock, 1, 0);
 return (initial == 0);
 }
-
+
 void unlock()
 {
 mLock = 0;
@@ -112,7 +112,7 @@ struct QUEUE
 __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
 _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
 };
-
+
 const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
 static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
 "FIFO element size should be multiple of SIMD width.");
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h 
b/src/gallium/drivers/swr/rasterizer/core/pa.h
index cb3470f..87dba22 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -162,7 +162,7 @@ struct PA_STATE_OPT : public PA_STATE
 bool   isStreaming{ false };
 
 SIMDMASK   junkIndices  { 0 };  // temporary index store 
for unused virtual function
-
+
 PA_STATE_OPT() {}
 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, 
uint32_t streamSizeInVerts,
 uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = 
TOP_UNKNOWN);
@@ -412,7 +412,7 @@ struct PA_STATE_CUT : public PA_STATE
 uint32_t vertsPerPrim{ 0 };
 bool processCutVerts{ false };   // vertex indices with cuts should be 
processed as normal, otherwise they
  // are ignored.  Fetch shader sends 
invalid verts on cuts that should be ignored
- // while the GS sends valid verts for 
every index 
+ // while the GS sends valid verts for 
every index
 
 simdvector  junkVector;  // junk simdvector for unimplemented 
API
 #if ENABLE_AVX512_SIMD16
@@ -575,7 +575,7 @@ struct PA_STATE_CUT : public PA_STATE
 return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
 }
 
-// iterates across the unprocessed verts until we hit the end or we 
+// iterates across the unprocessed verts until we hit the end or we
 // have assembled SIMD prims
 void ProcessVerts()
 {
@@ -583,7 +583,7 @@ struct PA_STATE_CUT : public PA_STATE
 this->numRemainingVerts > 0 &&
 this->curVertex != this->headVertex)
 {
-// if cut index, restart topology 
+// if cut index, restart topology
 if (IsCutIndex(this->curVertex))
 {
 if (this->processCutVerts)
@@ -923,7 +923,7 @@ struct PA_STATE_CUT : public PA_STATE
 case 6:
 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
 AssembleTriStripAdj();
-
+
 uint32_t nextTri[6];
 if (this->reverseWinding)
 {
@@ -939,7 +939,7 @@ struct PA_STATE_CUT : public PA_STATE
 nextTri[1] = this->adjExtraVert;
 nextTri[2] = this->vert[3];
 nextTri[4] = this->vert[4];
-nextTri[5] = this->vert[0]; 
+nextTri[5] = this->vert[0];
 }
 for (uint32_t i = 0; i < 6; ++i)
 {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr: limit pipe_draw_info->restart_index usage

2017-08-23 Thread Tim Rowley
Only copy this value when in restart drawing mode.

Eliminates valgrind errors when running trivial programs.
---
 src/gallium/drivers/swr/swr_draw.cpp | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/swr_draw.cpp 
b/src/gallium/drivers/swr/swr_draw.cpp
index df1c11a..2363800 100644
--- a/src/gallium/drivers/swr/swr_draw.cpp
+++ b/src/gallium/drivers/swr/swr_draw.cpp
@@ -107,7 +107,10 @@ swr_draw_vbo(struct pipe_context *pipe, const struct 
pipe_draw_info *info)
}
 
struct swr_vertex_element_state *velems = ctx->velems;
-   velems->fsState.cutIndex = info->restart_index;
+   if (info->primitive_restart)
+  velems->fsState.cutIndex = info->restart_index;
+   else
+  velems->fsState.cutIndex = 0;
velems->fsState.bEnableCutIndex = info->primitive_restart;
velems->fsState.bPartialVertexBuffer = (info->min_index > 0);
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] configure: remove trailing "-a" in swr architecture test

2017-08-10 Thread Tim Rowley
Fixes "configure: line 27326: test: argument expected"

CC: mesa-sta...@lists.freedesktop.org
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 5b12dd8..316e6a8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2545,7 +2545,7 @@ if test -n "$with_gallium_drivers"; then
 if test "x$HAVE_SWR_AVX" != xyes -a \
 "x$HAVE_SWR_AVX2" != xyes -a \
 "x$HAVE_SWR_KNL" != xyes -a \
-"x$HAVE_SWR_SKX" != xyes -a; then
+"x$HAVE_SWR_SKX" != xyes; then
AC_MSG_ERROR([swr enabled but no swr architectures selected])
 fi
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] swr/rast: [rasterizer core] fix invalid casting for calls to Interlocked* functions

2017-08-09 Thread Tim Rowley
CID: 1416243, 1416244, 1416255
CC: mesa-sta...@lists.freedesktop.org
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 2 +-
 src/gallium/drivers/swr/rasterizer/core/context.h   | 8 
 src/gallium/drivers/swr/rasterizer/core/threads.cpp | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 8dc9ac2..ccb6dfb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
 if (IsDraw)
 {
-InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE);
+InterlockedIncrement(&pContext->drawsOutstandingFE);
 }
 
 _ReadWriteBarrier();
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h 
b/src/gallium/drivers/swr/rasterizer/core/context.h
index 131b3cb..bcd5801 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -409,12 +409,12 @@ struct DRAW_CONTEXT
 booldependent;  // Backend work is dependent on all 
previous BE
 boolisCompute;  // Is this DC a compute context?
 boolcleanupState;   // True if this is the last draw using an 
entry in the state ring.
-volatile bool   doneFE; // Is FE work done for this draw?
 
 FE_WORK FeWork;
 
+volatile OSALIGNLINE(bool)   doneFE; // Is FE work done for 
this draw?
 volatile OSALIGNLINE(uint32_t)   FeLock;
-volatile int32_tthreadsDone;
+volatile OSALIGNLINE(uint32_t)   threadsDone;
 
 SYNC_DESC   retireCallback; // Call this func when this DC is retired.
 };
@@ -503,9 +503,9 @@ struct SWR_CONTEXT
 // Scratch space for workers.
 uint8_t** ppScratch;
 
-volatile int32_t  drawsOutstandingFE;
+volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
 
-CachingAllocator cachingArenaAllocator;
+OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
 uint32_t frameCount;
 
 uint32_t lastFrameChecked;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp 
b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 70bde02..b704d23 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -393,7 +393,7 @@ INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, 
uint32_t workerId, DRAW_CONT
 // inlined-only version
 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t 
workerId, DRAW_CONTEXT* pDC)
 {
-int32_t result = InterlockedDecrement((volatile long*)&pDC->threadsDone);
+int32_t result = 
static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
 SWR_ASSERT(result >= 0);
 
 AR_FLUSH(pDC->drawId);
@@ -639,7 +639,7 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t 
workerId, DRAW_CONTEX
 _mm_mfence();
 pDC->doneFE = true;
 
-InterlockedDecrement((volatile long*)&pContext->drawsOutstandingFE);
+InterlockedDecrement(&pContext->drawsOutstandingFE);
 }
 
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t 
)
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 17/17] swr/rast: fix core / knights split of AVX512 intrinsics

2017-08-01 Thread Tim Rowley
Move AVX512BW specific intrinsics to be Core-only.

Move some AVX512F intrinsics back to common implementation file.
---
 .../drivers/swr/rasterizer/common/simdlib.hpp  |  2 +
 .../swr/rasterizer/common/simdlib_512_avx512.inl   | 53 +
 .../rasterizer/common/simdlib_512_avx512_core.inl  | 54 ++
 .../common/simdlib_512_avx512_knights.inl  | 15 --
 4 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp 
b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 22d7da4..500cf8a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -214,6 +214,8 @@ struct SIMDBase : Traits::IsaImpl
 using Vec4  = typename Traits::Vec4;
 using Mask  = typename Traits::Mask;
 
+static const size_t VECTOR_BYTES = sizeof(Float);
+
 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes  
  .
 static SIMDINLINE
 void vec4_load1_ps(Vec4& r, const float *p)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 1dbfff8..95e4c31 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -158,6 +158,11 @@ private:
 return _mm512_maskz_set1_epi32(m, -1);
 }
 
+static SIMDINLINE Integer vmask(__mmask8 m)
+{
+return _mm512_maskz_set1_epi64(m, -1LL);
+}
+
 public:
 //---
 // Single precision floating point arithmetic operations
@@ -187,8 +192,8 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return 
round_ps 0xff) ? 0xff : (a + b) 
(uint8) 
+//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) 
(uint8) 
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -202,7 +207,7 @@ SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
 SIMD_IWRAPPER_2(mullo_epi32);
 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 
 //---
 // Logical operations
@@ -276,7 +281,7 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // 
return (float)a(i
 return _mm512_cvtepi32_ps(a);
 }
 
-SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16)
+//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a(uint8 --> int16)
 SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a(uint8 --> int32)
 SIMD_IWRAPPER_1_8(cvtepu16_epi32);// return (int32)a(uint16 --> int32)
 SIMD_IWRAPPER_1_4(cvtepu16_epi64);// return (int64)a(uint16 --> int64)
@@ -317,20 +322,6 @@ static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float 
b) { return cmp_ps(a, b); }
 
 template
-static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
-{
-// Legacy vector mask generator
-__mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast(CmpTypeT));
-return vmask(result);
-}
-template
-static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
-{
-// Legacy vector mask generator
-__mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast(CmpTypeT));
-return vmask(result);
-}
-template
 static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
 {
 // Legacy vector mask generator
@@ -345,12 +336,12 @@ static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, 
Integer b)
 return vmask(result);
 }
 
-SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8);// return a 
== b (int8)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16);   // return a 
== b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8);// return 
a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16);   // return 
a == b (int16)
 SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32);   // return a 
== b (int32)
 SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64);   // return a 
== b (int64)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8);// return a 
> b (int8)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16);   // return a 
> b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8);// return 
a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16);   // return 
a > b (int16)
 SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32);   // return a 
> b (int32)
 SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64);   // return a 
> b (int64)
 SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32);   // return a 
< b (int32)
@@ -458,7 +449,7 @@ 

[Mesa-dev] [PATCH v2 14/17] swr/rast: gen_knobs template code style

2017-08-01 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index e6fe165..a950643 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -203,8 +203,8 @@ GlobalKnobs g_GlobalKnobs;
 //
 GlobalKnobs::GlobalKnobs()
 {
-% for knob in knobs:
-InitKnob(${knob[0]});
+% for knob in knobs :
+InitKnob(${ knob[0] });
 % endfor
 }
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 16/17] swr/rast: simplify knob default value setup

2017-08-01 Thread Tim Rowley
---
 .../drivers/swr/rasterizer/codegen/templates/gen_knobs.h| 13 -
 src/gallium/drivers/swr/rasterizer/core/knobs_init.h| 12 +++-
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
index b02870b..d81f7d0 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
@@ -67,12 +67,6 @@ public:
 return Value();
 }
 
-protected:
-Knob(T const ) :
-m_Value(expandEnvironmentVariables(defaultValue))
-{
-}
-
 private:
 T m_Value;
 };
@@ -83,10 +77,10 @@ private:
 
 {   \\
 
-Knob_##_name() : Knob<_type>(_default) { }  \\
-
 static const char* Name() { return "KNOB_" #_name; }\\
 
+static _type DefaultValue() { return (_default); }  \\
+
 } _name;
 
 #define GET_KNOB(_name) g_GlobalKnobs._name.Value()
@@ -117,8 +111,9 @@ struct GlobalKnobs
 % endif
 
 % endfor
-GlobalKnobs();
+
 std::string ToString(const char* optPerLinePrefix="");
+GlobalKnobs();
 };
 extern GlobalKnobs g_GlobalKnobs;
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h 
b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index ba2df22..12c2a30 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -91,16 +91,18 @@ static inline void ConvertEnvToKnob(const char* pOverride, 
std::string& knobValu
 template 
 static inline void InitKnob(T& knob)
 {
-
-// TODO, read registry first
-
-// Second, read environment variables
+// Read environment variables
 const char* pOverride = getenv(knob.Name());
 
 if (pOverride)
 {
-auto knobValue = knob.Value();
+auto knobValue = knob.DefaultValue();
 ConvertEnvToKnob(pOverride, knobValue);
 knob.Value(knobValue);
 }
+else
+{
+// Set default value
+knob.Value(knob.DefaultValue());
+}
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 15/17] swr/rast: split gen_knobs templates into .h/.cpp

2017-08-01 Thread Tim Rowley
Switch to a 1:1 mapping template:generated for future maintenance.
---
 src/gallium/drivers/swr/Makefile.am|   3 +-
 src/gallium/drivers/swr/SConscript |   2 +-
 .../drivers/swr/rasterizer/codegen/gen_knobs.py|  14 +-
 .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 108 --
 .../swr/rasterizer/codegen/templates/gen_knobs.h   | 157 +
 5 files changed, 166 insertions(+), 118 deletions(-)
 create mode 100644 
src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h

diff --git a/src/gallium/drivers/swr/Makefile.am 
b/src/gallium/drivers/swr/Makefile.am
index 73fe904..b20f128 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -115,7 +115,7 @@ rasterizer/codegen/gen_knobs.cpp: 
rasterizer/codegen/gen_knobs.py rasterizer/cod
--output rasterizer/codegen/gen_knobs.cpp \
--gen_cpp
 
-rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.cpp 
rasterizer/codegen/gen_common.py
+rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py 
rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.h 
rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
$(srcdir)/rasterizer/codegen/gen_knobs.py \
@@ -347,5 +347,6 @@ EXTRA_DIST = \
rasterizer/codegen/templates/gen_builder.hpp \
rasterizer/codegen/templates/gen_header_init.hpp \
rasterizer/codegen/templates/gen_knobs.cpp \
+   rasterizer/codegen/templates/gen_knobs.h \
rasterizer/codegen/templates/gen_llvm.hpp \
rasterizer/codegen/templates/gen_rasterizer.cpp
diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index c578d7a..b394cbc 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -54,7 +54,7 @@ env.CodeGenerate(
 command = python_cmd + ' $SCRIPT --output $TARGET --gen_h'
 )
 Depends('rasterizer/codegen/gen_knobs.h',
-swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp')
+swrroot + 'rasterizer/codegen/templates/gen_knobs.h')
 
 env.CodeGenerate(
 target = 'rasterizer/jitter/gen_state_llvm.h',
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py 
b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
index 2c271c7..33f62a2 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_knobs.py
@@ -37,27 +37,25 @@ def main(args=sys.argv[1:]):
 args = parser.parse_args()
 
 cur_dir = os.path.dirname(os.path.abspath(__file__))
-template_file = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
+template_cpp = os.path.join(cur_dir, 'templates', 'gen_knobs.cpp')
+template_h = os.path.join(cur_dir, 'templates', 'gen_knobs.h')
 
 if args.gen_h:
 MakoTemplateWriter.to_file(
-template_file,
+template_h,
 args.output,
 cmdline=sys.argv,
 filename='gen_knobs',
-knobs=knob_defs.KNOBS,
-includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'],
-gen_header=True)
+knobs=knob_defs.KNOBS)
 
 if args.gen_cpp:
 MakoTemplateWriter.to_file(
-template_file,
+template_cpp,
 args.output,
 cmdline=sys.argv,
 filename='gen_knobs',
 knobs=knob_defs.KNOBS,
-includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'],
-gen_header=False)
+includes=['core/knobs_init.h', 'common/os.h', 'sstream', 
'iomanip'])
 
 return 0
 
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index a950643..2f4c47a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -20,11 +20,7 @@
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
-% if gen_header:
-* @file ${filename}.h
-% else:
 * @file ${filename}.cpp
-% endif 
 *
 * @brief Dynamic Knobs for Core.
 *
@@ -35,105 +31,6 @@
 *
 **/
 <% calc_max_knob_len(knobs) %>
-%if gen_header:
-#pragma once
-#include 
-
-struct KnobBase
-{
-private:
-// Update the input string.
-static void autoExpandEnvironmentVariables(std::string );
-
-protected:
-// Leave input alone and return new string.
-static std::string expandEnvironmentVariables(std::string const )
-{
-std::string text = input;
-autoExpandEnvironmentVariables(text);
-return text;
-}
-
-template 
-static T expandEnvironmentVariables(T const )
-{
-

[Mesa-dev] [PATCH v2 06/17] swr/rast: stop using MSFT types in platform independent code

2017-08-01 Thread Tim Rowley
---
 src/gallium/drivers/swr/rasterizer/common/os.h |  6 --
 src/gallium/drivers/swr/rasterizer/core/api.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/core/api.h  |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/blend.h|  2 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h |  8 
 src/gallium/drivers/swr/rasterizer/core/fifo.hpp   |  2 +-
 src/gallium/drivers/swr/rasterizer/core/format_traits.h|  4 ++--
 src/gallium/drivers/swr/rasterizer/core/pa.h   |  2 +-
 src/gallium/drivers/swr/rasterizer/core/threads.cpp|  4 ++--
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h  | 12 ++--
 src/gallium/drivers/swr/rasterizer/core/utils.h| 10 ++
 src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp|  2 +-
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp |  4 ++--
 14 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h 
b/src/gallium/drivers/swr/rasterizer/common/os.h
index dc90fca..4ed6b88 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -220,12 +220,6 @@ void *AlignedMalloc(unsigned int size, unsigned int 
alignment)
 return ret;
 }
 
-inline
-unsigned char _bittest(const LONG *a, LONG b)
-{
-return ((*(unsigned *)(a) & (1 << b)) != 0);
-}
-
 static inline
 void AlignedFree(void* p)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp 
b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 855d133..8dc9ac2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
 if (IsDraw)
 {
-InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE);
 }
 
 _ReadWriteBarrier();
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h 
b/src/gallium/drivers/swr/rasterizer/core/api.h
index 236e0fc..a394205 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -697,8 +697,8 @@ SWR_FUNC(void, SwrStoreHotTileToSurface,
 SWR_FUNC(void, SwrStoreHotTileClear,
  SWR_SURFACE_STATE *pDstSurface,
  SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- UINT x,
- UINT y,
+ uint32_t x,
+ uint32_t y,
  uint32_t renderTargetArrayIndex,
  const float* pClearColor);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp 
b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index de6691b..c1f0f07 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -64,7 +64,7 @@ INLINE void ProcessAttributes(
 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid 
value for NumVertsT");
 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
 // Conservative Rasterization requires degenerate tris to have constant 
attribute interpolation
-LONG constantInterpMask = IsDegenerate::value ? 0x : 
backendState.constantInterpolationMask;
+uint32_t constantInterpMask = IsDegenerate::value ? 0x : 
backendState.constantInterpolationMask;
 const uint32_t provokingVertex = 
pDC->pState->state.frontendState.topologyProvokingVertex;
 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
 
@@ -93,7 +93,7 @@ INLINE void ProcessAttributes(
 
 if (HasConstantInterpT::value || IsDegenerate::value)
 {
-if (_bittest(&constantInterpMask, i))
+if (CheckBit(constantInterpMask, i))
 {
 uint32_t vid;
 uint32_t adjustedTriIndex;
diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h 
b/src/gallium/drivers/swr/rasterizer/core/blend.h
index 1b98e44..c89c476 100644
--- a/src/gallium/drivers/swr/rasterizer/core/blend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h
@@ -278,7 +278,7 @@ INLINE void Clamp(simdvector )
 }
 
 template
-void Blend(const SWR_BLEND_STATE *pBlendState, const 
SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector , simdvector& src1, BYTE 
*pDst, simdvector )
+void Blend(const SWR_BLEND_STATE *pBlendState, const 
SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector , simdvector& src1, 
uint8_t *pDst, simdvector )
 {
 // load render target
 simdvector dst;
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h 
b/src/gallium/drivers/swr/rasterizer/core/clip.h
index bf16792..ca6596e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -464,7 +464,7 @@ public:
 // input/output vertex store for clipper
 simdvertex vertices[7]; // maximum 7 verts generated per 

[Mesa-dev] [PATCH v2 08/17] swr/rast: rename frontend pVertexStore

2017-08-01 Thread Tim Rowley
Rename to reflect global nature.
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp 
b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f9eda83..e51f967 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1332,7 +1332,7 @@ static void TessellationStages(
 TSDestroyCtx(tsCtx);
 }
 
-THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
+THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
 THREAD uint32_t gVertexStoreSize = 0;
 
 //
@@ -1459,19 +1459,22 @@ void ProcessDraw(
 // grow the vertex store for the PA as necessary
 if (gVertexStoreSize < vertexStoreSize)
 {
-if (pVertexStore != nullptr)
+if (gpVertexStore != nullptr)
 {
-AlignedFree(pVertexStore);
+AlignedFree(gpVertexStore);
+gpVertexStore = nullptr;
 }
 
-pVertexStore = reinterpret_cast(AlignedMalloc(vertexStoreSize, 64));
+SWR_ASSERT(gpVertexStore == nullptr);
+
+gpVertexStore = reinterpret_cast(AlignedMalloc(vertexStoreSize, 64));
 gVertexStoreSize = vertexStoreSize;
 
-SWR_ASSERT(pVertexStore != nullptr);
+SWR_ASSERT(gpVertexStore != nullptr);
 }
 
 // choose primitive assembler
-PA_FACTORY paFactory(pDC, state.topology, 
work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
+PA_FACTORY paFactory(pDC, state.topology, 
work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize);
 PA_STATE& pa = paFactory.GetPA();
 
 #if USE_SIMD16_FRONTEND
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 13/17] swr/rast: switch gen_knobs.cpp license

2017-08-01 Thread Tim Rowley
Unintentionally added with an apache2 license; relicense to match
the rest of the tree.
---
 .../swr/rasterizer/codegen/templates/gen_knobs.cpp | 29 +-
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp 
b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index 06b93bd..e6fe165 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -1,19 +1,24 @@
 /**
+* Copyright (C) 2015-2017 Intel Corporation.   All Rights Reserved.
 *
-* Copyright 2015-2017
-* Intel Corporation
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
 *
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
 *
-* http ://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
 *
 % if gen_header:
 * @file ${filename}.h
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 12/17] swr/rast: fix scons gen_knobs.h dependency

2017-08-01 Thread Tim Rowley
Copy/paste error was duplicating a gen_knobs.cpp rule.
---
 src/gallium/drivers/swr/SConscript | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/SConscript 
b/src/gallium/drivers/swr/SConscript
index a32807d..c578d7a 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -53,7 +53,7 @@ env.CodeGenerate(
 source = '',
 command = python_cmd + ' $SCRIPT --output $TARGET --gen_h'
 )
-Depends('rasterizer/codegen/gen_knobs.cpp',
+Depends('rasterizer/codegen/gen_knobs.h',
 swrroot + 'rasterizer/codegen/templates/gen_knobs.cpp')
 
 env.CodeGenerate(
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 09/17] swr/rast: vmask() implementations for KNL

2017-08-01 Thread Tim Rowley
---
 .../swr/rasterizer/common/simdlib_512_avx512_knights.inl   | 14 ++
 1 file changed, 14 insertions(+)

diff --git 
a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl 
b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
index 17001be..2ee7639 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
@@ -132,6 +132,20 @@
 }
 #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
+private:
+static SIMDINLINE Integer vmask(__mmask8 m)
+{
+return _mm512_mask_set1_epi64(_mm512_setzero_si512(), m, -1LL);
+}
+static SIMDINLINE Integer vmask(__mmask32 m)
+{
+return _mm512_mask_set1_epi16(_mm512_setzero_si512(), m, -1);
+}
+static SIMDINLINE Integer vmask(__mmask64 m)
+{
+return _mm512_mask_set1_epi8(_mm512_setzero_si512(), m, -1);
+}
+
 public:
 SIMD_WRAPPERI_2_(and_ps, and_epi32);  // return a & b   (float 
treated as int)
 SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);// return (~a) & b(float 
treated as int)
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


  1   2   3   4   5   6   7   >