From: Ruiling Song <[email protected]> Different from ocl 1.2, which gather all constant into one surface. ocl2 only gather program global/constant into one surface. But keep other constant passed through kernel argument in separate buffer.
Signed-off-by: Ruiling Song <[email protected]> --- backend/src/backend/gen_insn_selection.cpp | 15 ++++-- backend/src/backend/program.cpp | 21 +++++++- backend/src/backend/program.h | 6 +++ backend/src/backend/program.hpp | 4 ++ backend/src/gbe_bin_interpreter.cpp | 2 + backend/src/ir/profile.cpp | 4 +- backend/src/ir/profile.hpp | 3 +- backend/src/ir/unit.hpp | 50 ++++++++++++++++++ backend/src/llvm/llvm_gen_backend.cpp | 84 ++++++++++++++++++------------ kernels/compiler_program_global.cl | 68 ++++++++++++++++++++++++ src/cl_command_queue.c | 2 +- src/cl_command_queue_gen7.c | 16 ++++++ src/cl_gbe_loader.cpp | 10 ++++ src/cl_gbe_loader.h | 2 + src/cl_program.c | 46 ++++++++++++++++ src/cl_program.h | 2 + utests/CMakeLists.txt | 1 + utests/compiler_program_global.cpp | 80 ++++++++++++++++++++++++++++ 18 files changed, 374 insertions(+), 42 deletions(-) create mode 100644 kernels/compiler_program_global.cl create mode 100644 utests/compiler_program_global.cpp diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d19f985..6ef077b 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -3596,6 +3596,13 @@ namespace gbe LoadInstructionPattern(void) : SelectionPattern(1, 1) { this->opcodes.push_back(ir::OP_LOAD); } + bool isReadConstantLegacy(const ir::LoadInstruction &load) const { + ir::AddressMode AM = load.getAddressMode(); + ir::AddressSpace AS = load.getAddressSpace(); + if (AM != ir::AM_Stateless && AS == ir::MEM_CONSTANT) + return true; + return false; + } void untypedReadStateless(Selection::Opaque &sel, GenRegister addr, vector<GenRegister> &dst @@ -3678,7 +3685,7 @@ namespace gbe unsigned SI = insn.getSurfaceIndex(); sel.UNTYPED_READ(addr, dst.data(), valueNum, GenRegister::immud(SI), btiTemp); } - } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT ) { + } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn) ) { // stateless mode, local/constant still use bti access unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe; GenRegister addrDW = addr; @@ -3842,7 +3849,7 @@ namespace gbe b = GenRegister::immud(insn.getSurfaceIndex()); } read64Legacy(sel, addr, dst, b, btiTemp); - } else if (addrSpace == MEM_LOCAL || addrSpace == MEM_CONSTANT) { + } else if (addrSpace == MEM_LOCAL || isReadConstantLegacy(insn)) { GenRegister b = GenRegister::immud(addrSpace == MEM_LOCAL? 0xfe : BTI_CONSTANT); GenRegister addrDW = addr; if (addrBytes == 8) @@ -4063,7 +4070,7 @@ namespace gbe unsigned SI = insn.getSurfaceIndex(); sel.BYTE_GATHER(dst, addr, elemSize, GenRegister::immud(SI), btiTemp); } - } else if (addrSpace == ir::MEM_LOCAL || addrSpace == ir::MEM_CONSTANT) { + } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) { unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe; GenRegister addrDW = addr; if (addrBytes == 8) { @@ -4207,7 +4214,7 @@ namespace gbe const Type type = insn.getValueType(); const uint32_t elemSize = getByteScatterGatherSize(sel, type); - if (addrSpace == MEM_CONSTANT) { + if (isReadConstantLegacy(insn)) { // XXX TODO read 64bit constant through constant cache // Per HW Spec, constant cache messages can read at least DWORD data. // So, byte/short data type, we have to read through data cache. diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp index 36af95f..ce4f927 100644 --- a/backend/src/backend/program.cpp +++ b/backend/src/backend/program.cpp @@ -104,11 +104,13 @@ namespace gbe { return it->offset; // we found it! } - Program::Program(void) : constantSet(NULL) {} + Program::Program(void) : constantSet(NULL), + relocTable(NULL) {} Program::~Program(void) { for (map<std::string, Kernel*>::iterator it = kernels.begin(); it != kernels.end(); ++it) GBE_DELETE(it->second); if (constantSet) delete constantSet; + if (relocTable) delete relocTable; } #ifdef GBE_COMPILER_AVAILABLE @@ -151,6 +153,7 @@ namespace gbe { bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) { constantSet = new ir::ConstantSet(unit.getConstantSet()); + relocTable = new ir::RelocTable(unit.getRelocTable()); const auto &set = unit.getFunctionSet(); const uint32_t kernelNum = set.size(); if (OCL_OUTPUT_GEN_IR) std::cout << unit; @@ -978,6 +981,18 @@ namespace gbe { program->getGlobalConstantData(mem); } + static size_t programGetGlobalRelocCount(gbe_program gbeProgram) { + if (gbeProgram == NULL) return 0; + const gbe::Program *program = (const gbe::Program*) gbeProgram; + return program->getGlobalRelocCount(); + } + + static void programGetGlobalRelocTable(gbe_program gbeProgram, char *mem) { + if (gbeProgram == NULL) return; + const gbe::Program *program = (const gbe::Program*) gbeProgram; + program->getGlobalRelocTable(mem); + } + static uint32_t programGetKernelNum(gbe_program gbeProgram) { if (gbeProgram == NULL) return 0; const gbe::Program *program = (const gbe::Program*) gbeProgram; @@ -1220,6 +1235,8 @@ GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NU GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL; GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL; GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL; +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count = NULL; +GBE_EXPORT_SYMBOL gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table = NULL; GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL; GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL; GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL; @@ -1269,6 +1286,8 @@ namespace gbe gbe_program_check_opt = gbe::programCheckOption; gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize; gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData; + gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount; + gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable; gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource; gbe_program_delete = gbe::programDelete; gbe_program_get_kernel_num = gbe::programGetKernelNum; diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index 86b3177..03150bc 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -99,6 +99,7 @@ enum gbe_curbe_type { GBE_CURBE_BLOCK_IP, GBE_CURBE_DW_BLOCK_IP, GBE_CURBE_THREAD_NUM, + GBE_CURBE_CONSTANT_ADDRSPACE, GBE_GEN_REG, }; @@ -243,6 +244,11 @@ extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_ typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem); extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data; +typedef size_t (gbe_program_get_global_reloc_count_cb)(gbe_program gbeProgram); +extern gbe_program_get_global_reloc_count_cb *gbe_program_get_global_reloc_count; + +typedef void (gbe_program_get_global_reloc_table_cb)(gbe_program gbeProgram, char *mem); +extern gbe_program_get_global_reloc_table_cb *gbe_program_get_global_reloc_table; /*! Get the size of defined samplers */ typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel); extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size; diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp index efe192f..e58ddf0 100644 --- a/backend/src/backend/program.hpp +++ b/backend/src/backend/program.hpp @@ -280,6 +280,8 @@ namespace gbe { /*! Get the content of global constant arrays */ void getGlobalConstantData(char *mem) const { constantSet->getData(mem); } + uint32_t getGlobalRelocCount(void) const { return relocTable->getCount(); } + void getGlobalRelocTable(char *p) const { relocTable->getData(p); } static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G'); static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P'); @@ -309,6 +311,8 @@ namespace gbe { map<std::string, Kernel*> kernels; /*! Global (constants) outside any kernel */ ir::ConstantSet *constantSet; + /*! relocation table */ + ir::RelocTable *relocTable; /*! Use custom allocators */ GBE_CLASS(Program); }; diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp index 4594a0a..0957092 100644 --- a/backend/src/gbe_bin_interpreter.cpp +++ b/backend/src/gbe_bin_interpreter.cpp @@ -61,6 +61,8 @@ struct BinInterpCallBackInitializer gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize; gbe_program_delete = gbe::programDelete; gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData; + gbe_program_get_global_reloc_count = gbe::programGetGlobalRelocCount; + gbe_program_get_global_reloc_table = gbe::programGetGlobalRelocTable; gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData; gbe_kernel_get_image_data = gbe::kernelGetImageData; gbe_kernel_get_arg_info = gbe::kernelGetArgInfo; diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 3ead8a7..0699167 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -43,7 +43,8 @@ namespace ir { "zero", "one", "retVal", "printf_buffer_pointer", "printf_index_buffer_pointer", - "dwblockip" + "dwblockip", + "constant_addrspace_start" }; #if GBE_DEBUG @@ -86,6 +87,7 @@ namespace ir { DECL_NEW_REG(FAMILY_QWORD, printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER); DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER); DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP); + DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index a8445c4..79761d4 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -71,7 +71,8 @@ namespace ir { static const Register printfbptr = Register(27); // printf buffer address . static const Register printfiptr = Register(28); // printf index buffer address. static const Register dwblockip = Register(29); // blockip - static const uint32_t regNum = 30; // number of special registers + static const Register constant_addrspace = Register(30); // starting address of program-scope constant + static const uint32_t regNum = 31; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp index 8ff858d..b8df145 100644 --- a/backend/src/ir/unit.hpp +++ b/backend/src/ir/unit.hpp @@ -27,6 +27,7 @@ #include "ir/constant.hpp" #include "ir/register.hpp" #include "sys/map.hpp" +#include <string.h> namespace gbe { namespace ir { @@ -37,6 +38,52 @@ namespace ir { /*! Complete unit of compilation. It contains a set of functions and a set of * constant the functions may refer to. */ + struct RelocEntry { + RelocEntry(unsigned int rO, unsigned int dO): + refOffset(rO), + defOffset(dO) {} + + unsigned int refOffset; + unsigned int defOffset; + }; + + class RelocTable : public NonCopyable, public Serializable + { + public: + void addEntry(unsigned refOffset, unsigned defOffset) { + entries.push_back(RelocEntry(refOffset, defOffset)); + } + RelocTable() {} + RelocTable(const RelocTable& other) : Serializable(other), + entries(other.entries) {} + uint32_t getCount() { return entries.size(); } + void getData(char *p) { + if (entries.size() > 1 && p) + memcpy(p, entries.data(), entries.size()*sizeof(RelocEntry)); + } + static const uint32_t magic_begin = TO_MAGIC('R', 'E', 'L', 'C'); + static const uint32_t magic_end = TO_MAGIC('C', 'L', 'E', 'R'); + + /* format: + magic_begin | + const_data_size | + const_data | + constant_1_size | + constant_1 | + ........ | + constant_n_size | + constant_n | + magic_end | + total_size + */ + + /*! Implements the serialization. */ + virtual size_t serializeToBin(std::ostream& outs) { return 0;} + virtual size_t deserializeFromBin(std::istream& ins) { return 0; } + private: + vector<RelocEntry> entries; + GBE_CLASS(RelocTable); + }; class Unit : public NonCopyable { public: @@ -70,6 +117,8 @@ namespace ir { } /*! Return the constant set */ ConstantSet& getConstantSet(void) { return constantSet; } + const RelocTable& getRelocTable(void) const { return relocTable; } + RelocTable& getRelocTable(void) { return relocTable; } /*! Return the constant set */ const ConstantSet& getConstantSet(void) const { return constantSet; } void setValid(bool value) { valid = value; } @@ -78,6 +127,7 @@ namespace ir { friend class ContextInterface; //!< Can free modify the unit FunctionSet functions; //!< All the defined functions ConstantSet constantSet; //!< All the constants defined in the unit + RelocTable relocTable; PointerSize pointerSize; //!< Size shared by all pointers GBE_CLASS(Unit); bool valid; diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index cb47097..d23a598 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -506,7 +506,7 @@ namespace gbe virtual bool doInitialization(Module &M); /*! helper function for parsing global constant data */ - void getConstantData(const Constant * c, void* mem, uint32_t& offset) const; + void getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &) const; void collectGlobalConstant(void) const; ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u); const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u); @@ -1111,8 +1111,9 @@ namespace gbe break; } case 2: - new_bti = BTI_CONSTANT; - + // ocl 2.0, constant pointer use separate bti + new_bti = btiBase; + incBtiBase(); break; case 3: new_bti = BTI_LOCAL; @@ -1349,22 +1350,34 @@ namespace gbe return; } - void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const { + void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset, vector<ir::RelocEntry> &relocs) const { Type * type = c->getType(); Type::TypeID id = type->getTypeID(); GBE_ASSERT(c); + if (isa<GlobalVariable>(c)) { + const GlobalVariable *GV = cast<GlobalVariable>(c); + + unsigned valueAddrSpace = GV->getType()->getAddressSpace(); + ir::Constant cc = unit.getConstantSet().getConstant(c->getName()); + unsigned int defOffset = cc.getOffset(); + + relocs.push_back(ir::RelocEntry(offset, defOffset)); + uint32_t size = getTypeByteSize(unit, type); + memset((char*)mem+offset, 0, size); + offset += size; + return; + } if(isa<UndefValue>(c)) { uint32_t size = getTypeByteSize(unit, type); offset += size; return; - } else if(isa<ConstantAggregateZero>(c)) { + } else if(isa<ConstantAggregateZero>(c) || isa<ConstantPointerNull>(c)) { uint32_t size = getTypeByteSize(unit, type); memset((char*)mem+offset, 0, size); offset += size; return; } - switch(id) { case Type::TypeID::StructTyID: { @@ -1382,7 +1395,7 @@ namespace gbe offset += padding/8; const Constant* sub = cast<Constant>(c->getOperand(op)); GBE_ASSERT(sub); - getConstantData(sub, mem, offset); + getConstantData(sub, mem, offset, relocs); } break; } @@ -1401,7 +1414,7 @@ namespace gbe uint32_t ops = c->getNumOperands(); for(uint32_t op = 0; op < ops; ++op) { Constant * ca = dyn_cast<Constant>(c->getOperand(op)); - getConstantData(ca, mem, offset); + getConstantData(ca, mem, offset, relocs); offset += padding; } } @@ -1449,21 +1462,34 @@ namespace gbe const Module::GlobalListType &globalList = TheModule->getGlobalList(); for(auto i = globalList.begin(); i != globalList.end(); i ++) { const GlobalVariable &v = *i; - if(!v.isConstantUsed()) continue; const char *name = v.getName().data(); unsigned addrSpace = v.getType()->getAddressSpace(); - if(addrSpace == ir::AddressSpace::MEM_CONSTANT || v.isConstant()) { - GBE_ASSERT(v.hasInitializer()); - const Constant *c = v.getInitializer(); - Type * type = c->getType(); + + vector<ir::RelocEntry> relocs; + if(addrSpace == 2 /* __constant */ + || addrSpace == 1 + || addrSpace == 0) { + Type * type = v.getValueType(); uint32_t size = getTypeByteSize(unit, type); void* mem = malloc(size); uint32_t offset = 0; - getConstantData(c, mem, offset); + if (v.hasInitializer()) { + const Constant *c = v.getInitializer(); + getConstantData(c, mem, offset, relocs); + } else { + memset(mem, 0, size); + } uint32_t alignment = getAlignmentByte(unit, type); unit.newConstant((char *)mem, name, size, alignment); free(mem); + uint32_t refOffset = unit.getConstantSet().getConstant(name).getOffset(); + for (uint32_t k = 0; k < relocs.size(); k++) { + unit.getRelocTable().addEntry( + refOffset + relocs[k].refOffset, + relocs[k].defOffset + ); + } } } } @@ -2562,33 +2588,23 @@ namespace gbe this->newRegister(const_cast<GlobalVariable*>(&v)); ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0); ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType()))); - } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) { - GBE_ASSERT(v.hasInitializer()); - this->newRegister(const_cast<GlobalVariable*>(&v)); - ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0); - ir::Constant &con = unit.getConstantSet().getConstant(v.getName()); - ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType()))); - } else { + } else if(addrSpace == ir::MEM_CONSTANT + || addrSpace == ir::MEM_GLOBAL + || v.isConstant()) { if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) { ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second); regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v)); } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) { ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second); regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v)); - } else if(v.getName().str().substr(0, 4) == ".str") { - /* When there are multi printf statements in multi kernel fucntions within the same - translate unit, if they have the same sting parameter, such as - kernel_func1 () { - printf("Line is %d\n", line_num1); - } - kernel_func2 () { - printf("Line is %d\n", line_num2); - } - The Clang will just generate one global string named .strXXX to represent "Line is %d\n" - So when translating the kernel_func1, we can not unref that global var, so we will - get here. Just ignore it to avoid assert. */ } else { - GBE_ASSERT(0 && "Unsupported private memory access pattern"); + this->newRegister(const_cast<GlobalVariable*>(&v)); + ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0); + ir::Constant &con = unit.getConstantSet().getConstant(v.getName()); + ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType()))); + if (!legacyMode) { + ctx.ADD(getType(ctx, v.getType()), reg, ir::ocl::constant_addrspace, reg); + } } } } diff --git a/kernels/compiler_program_global.cl b/kernels/compiler_program_global.cl new file mode 100644 index 0000000..405c53f --- /dev/null +++ b/kernels/compiler_program_global.cl @@ -0,0 +1,68 @@ +struct config{ + int s0; + global short *s1; +}; + +global int i = 5; +global int bb = 4; +global int *global p; + +/* array */ +global int ba[12]; + +/* short/long data type */ +global short s; +global short s2; +global long l; + +/* pointer in constant AS to global */ +global int * constant px =&i; + +/* constant pointer relocation */ +constant int x = 2; +constant int y =1; +constant int *constant z[2] = {&x, &y}; + +/* structure with pointer field */ +global struct config c[2] = {{1, &s}, {2, &s2} }; + + +global int a = 1; +global int b = 2; +global int * constant gArr[2]= {&a, &b}; + +__kernel void compiler_program_global0(const global int *src, int dynamic) { + size_t gid = get_global_id(0); + /* global read/write */ + p = &i; + *p += 1; + + /* pointer in struct memory access */ + *c[gid&1].s1 += 2; + + s = 2; + l = 3; + + /* constant AS pointer (points to global) memory access */ + *px += *z[dynamic]; + + p = &bb; + /* array */ + if (gid < 11) + ba[gid] = src[gid]; +} + +__kernel void compiler_program_global1(global int *dst, int dynamic) { + size_t gid = get_global_id(0); +// static global sg; + + dst[11] = i; + dst[12] = *p; + dst[13] = s; + dst[14] = l; + dst[15] = *gArr[dynamic]; + + if (gid < 11) + dst[gid] = ba[gid]; +} + diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 9dc3fe6..442c6a2 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -161,7 +161,7 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k) for (i = 0; i < k->arg_n; ++i) { int32_t offset; // location of the address in the curbe arg_type = interp_kernel_get_arg_type(k->opaque, i); - if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem) + if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !k->args[i].mem) continue; offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i); if (offset < 0) diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 2edc3be..61ffe7e 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -29,6 +29,7 @@ #include <assert.h> #include <stdio.h> #include <string.h> +#include <unistd.h> #define MAX_GROUP_SIZE_IN_HALFSLICE 512 static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; } @@ -117,6 +118,11 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker) * we need raw_size & aligned_size */ GET_QUEUE_THREAD_GPGPU(queue); + // TODO this is only valid for OpenCL 1.2, + // under ocl1.2 we gather all constant into one dedicated surface. + // but in 2.0 we put program global into one surface, but constants + // pass through kernel argument in each separate buffer +#if 0 int32_t arg; size_t offset = 0; uint32_t raw_size = 0, aligned_size =0; @@ -185,6 +191,16 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker) } } cl_buffer_unmap(bo); +#endif + // pass the starting of constant address space + int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0); + if (constant_addrspace >= 0) { + size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque); + if (global_const_size > 0) { + *(uint64_t*)(ker->curbe + constant_addrspace) = (uint64_t)ker->program->global_data_ptr; + cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT); + } + } return 0; } diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp index e832a53..d75c92c 100644 --- a/src/cl_gbe_loader.cpp +++ b/src/cl_gbe_loader.cpp @@ -38,6 +38,8 @@ gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL; gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL; gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL; gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL; +gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count = NULL; +gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table = NULL; gbe_program_delete_cb *interp_program_delete = NULL; gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL; gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL; @@ -109,6 +111,14 @@ struct GbeLoaderInitializer if (interp_program_get_global_constant_data == NULL) return false; + interp_program_get_global_reloc_count = *(gbe_program_get_global_reloc_count_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_count"); + if (interp_program_get_global_reloc_count == NULL) + return false; + + interp_program_get_global_reloc_table = *(gbe_program_get_global_reloc_table_cb**)dlsym(dlhInterp, "gbe_program_get_global_reloc_table"); + if (interp_program_get_global_reloc_table == NULL) + return false; + interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete"); if (interp_program_delete == NULL) return false; diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h index de91c85..28741ff 100644 --- a/src/cl_gbe_loader.h +++ b/src/cl_gbe_loader.h @@ -38,6 +38,8 @@ extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource; extern gbe_program_new_from_binary_cb *interp_program_new_from_binary; extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size; extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data; +extern gbe_program_get_global_reloc_count_cb *interp_program_get_global_reloc_count; +extern gbe_program_get_global_reloc_table_cb *interp_program_get_global_reloc_table; extern gbe_program_delete_cb *interp_program_delete; extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num; extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name; diff --git a/src/cl_program.c b/src/cl_program.c index 98b6d51..ffdb2a1 100644 --- a/src/cl_program.c +++ b/src/cl_program.c @@ -97,6 +97,9 @@ cl_program_delete(cl_program p) cl_kernel_delete(p->ker[i]); cl_free(p->ker); + cl_free(p->global_data_ptr); + if (p->global_data_ptr) + cl_buffer_unreference(p->global_data); /* Program belongs to their parent context */ cl_context_delete(p->ctx); @@ -191,6 +194,42 @@ LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX ind #define isLLVM_LIB(BufPtr) headerCompare(BufPtr, BHI_LIBRARY) #define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY) +static cl_int get_program_global_data(cl_program prog) { + cl_buffer_mgr bufmgr = NULL; + bufmgr = cl_context_get_bufmgr(prog->ctx); + assert(bufmgr); + size_t const_size = interp_program_get_global_constant_size(prog->opaque); + if (const_size == 0) return CL_SUCCESS; + + int page_size = getpagesize(); + size_t alignedSz = ALIGN(const_size, page_size); + char * p = (char*)cl_aligned_malloc(alignedSz, page_size); + prog->global_data_ptr = p; + interp_program_get_global_constant_data(prog->opaque, (char*)p); + + prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0); + cl_buffer_set_softpin_offset(prog->global_data, (size_t)p); + + uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque); + if (reloc_count > 0) { + uint32_t x; + struct RelocEntry {int refOffset; int defOffset;}; + char *temp = (char*) malloc(reloc_count *sizeof(int)*2); + interp_program_get_global_reloc_table(prog->opaque, temp); + for (x = 0; x < reloc_count; x++) { + int ref_offset = ((struct RelocEntry *)temp)[x].refOffset; + *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p; + } + free(temp); + } +#if 0 + int x = 0; + for (x = 0; x < const_size; x++) { + printf("offset %d data: %x\n", x, (unsigned)p[x]); + } +#endif + return CL_SUCCESS; +} LOCAL cl_program cl_program_create_from_binary(cl_context ctx, cl_uint num_devices, @@ -603,6 +642,9 @@ cl_program_build(cl_program p, const char *options) memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz); copyed += sz; } + if ((err = get_program_global_data(p)) != CL_SUCCESS) + goto error; + p->is_built = 1; p->build_status = CL_BUILD_SUCCESS; return CL_SUCCESS; @@ -697,6 +739,10 @@ cl_program_link(cl_context context, memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz); copyed += sz; } + + if ((err = get_program_global_data(p)) != CL_SUCCESS) + goto error; + done: if(p) p->is_built = 1; if(p) p->build_status = CL_BUILD_SUCCESS; diff --git a/src/cl_program.h b/src/cl_program.h index 63ad16d..083d66a 100644 --- a/src/cl_program.h +++ b/src/cl_program.h @@ -54,6 +54,8 @@ struct _cl_program { cl_kernel *ker; /* All kernels included by the OCL file */ cl_program prev, next; /* We chain the programs together */ cl_context ctx; /* Its parent context */ + cl_buffer global_data; + char * global_data_ptr; char *bin; /* The program copied verbatim */ size_t bin_sz; /* Its size in memory */ char *source; /* Program sources */ diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index 78442cb..0fca450 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -191,6 +191,7 @@ set (utests_sources compiler_bool_cross_basic_block.cpp compiler_private_const.cpp compiler_private_data_overflow.cpp + compiler_program_global.cpp compiler_getelementptr_bitcast.cpp compiler_sub_group_any.cpp compiler_sub_group_all.cpp diff --git a/utests/compiler_program_global.cpp b/utests/compiler_program_global.cpp new file mode 100644 index 0000000..ef7c655 --- /dev/null +++ b/utests/compiler_program_global.cpp @@ -0,0 +1,80 @@ +#include "utest_helper.hpp" +#include "utest_file_map.hpp" + +static int init_program(const char* name, cl_context ctx, cl_program *pg ) +{ + cl_int err; + char* ker_path = cl_do_kiss_path(name, device); + + cl_file_map_t *fm = cl_file_map_new(); + err = cl_file_map_open(fm, ker_path); + if(err != CL_FILE_MAP_SUCCESS) + OCL_ASSERT(0); + const char *src = cl_file_map_begin(fm); + + *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err); + free(ker_path); + cl_file_map_delete(fm); + return 0; + +} + +void compiler_program_global() +{ + const int n = 16; + int cpu_src[16]; + cl_int err; + + // Setup kernel and buffers + cl_program program; + init_program("compiler_program_global.cl", ctx, &program); + OCL_CALL (clBuildProgram, program, 1, &device, "-cl-std=CL2.0", NULL, NULL); + + cl_kernel k0 = clCreateKernel(program, "compiler_program_global0", &err); + assert(err == CL_SUCCESS); + cl_kernel k1 = clCreateKernel(program, "compiler_program_global1", &err); + assert(err == CL_SUCCESS); + + OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL); + OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL); + + OCL_CALL (clSetKernelArg, k0, 0, sizeof(cl_mem), &buf[0]); + OCL_CALL (clSetKernelArg, k1, 0, sizeof(cl_mem), &buf[1]); + + int dynamic = 1; + OCL_CALL (clSetKernelArg, k0, 1, sizeof(cl_int), &dynamic); + OCL_CALL (clSetKernelArg, k1, 1, sizeof(cl_int), &dynamic); + + globals[0] = 16; + locals[0] = 16; + + OCL_MAP_BUFFER(0); + for (int i = 0; i < n; ++i) + cpu_src[i] = ((int*)buf_data[0])[i] = i; + OCL_UNMAP_BUFFER(0); + + // Run the kernel on GPU + OCL_CALL (clEnqueueNDRangeKernel, queue, k0, 1, NULL, globals, locals, 0, NULL, NULL); + OCL_CALL (clEnqueueNDRangeKernel, queue, k1, 1, NULL, globals, locals, 0, NULL, NULL); + + // Compare + OCL_MAP_BUFFER(1); + for (int32_t i = 0; i < n; ++i) { +// printf("i=%d dst=%d\n", i, ((int*)buf_data[1])[i]); + switch(i) { + default: OCL_ASSERT(((int*)buf_data[1])[i] == i); break; + case 11: OCL_ASSERT(((int*)buf_data[1])[i] == 7); break; + case 12: OCL_ASSERT(((int*)buf_data[1])[i] == 4); break; + case 13: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break; + case 14: OCL_ASSERT(((int*)buf_data[1])[i] == 3); break; + case 15: OCL_ASSERT(((int*)buf_data[1])[i] == 2); break; + } + } + OCL_UNMAP_BUFFER(1); + clReleaseKernel(k0); + clReleaseKernel(k1); + clReleaseProgram(program); +} + +MAKE_UTEST_FROM_FUNCTION(compiler_program_global); + -- 2.4.1 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
