From: Junyan He <junyan...@intel.com> The OpenCL 2.0 spec requires a global memory, and a global pointer can point to any global variable. We add a .rel.rodata relocation section (of type SHT_RELA) to the ELF file to support this relocation. The global memory is only available for OpenCL 2.0 and later.
Signed-off-by: Junyan He <junyan...@intel.com> --- CMakeLists.txt | 2 +- backend/src/backend/gen_program_elf.cpp | 54 ++++++++-- backend/src/ir/reloc.cpp | 2 +- backend/src/ir/reloc.hpp | 22 +++- backend/src/llvm/llvm_gen_backend.cpp | 3 +- src/cl_gen7_device.h | 2 +- src/gen/cl_command_queue_gen.c | 52 +++++++++- src/gen/cl_gen.h | 25 +++++ src/gen/cl_gen75_device.h | 2 +- src/gen/cl_gen7_device.h | 2 +- src/gen/cl_gen8_device.h | 2 +- src/gen/cl_gen9_device.h | 5 +- src/gen/cl_gen_device_common.h | 4 - src/gen/cl_kernel_gen.c | 8 +- src/gen/cl_program_gen.c | 178 ++++++++++++++++++++++++++++++-- src/gen/intel_driver.c | 11 +- src/gen/intel_driver.h | 2 + 17 files changed, 327 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fe895d0..e6babe4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,7 +239,7 @@ IF (EXPERIMENTAL_DOUBLE) ADD_DEFINITIONS(-DENABLE_FP64) ENDIF(EXPERIMENTAL_DOUBLE) -SET(CAN_OPENCL_20 OFF) +SET(CAN_OPENCL_20 ON) IF (CMAKE_SIZEOF_VOID_P EQUAL 4) SET(CAN_OPENCL_20 OFF) ENDIF (CMAKE_SIZEOF_VOID_P EQUAL 4) diff --git a/backend/src/backend/gen_program_elf.cpp b/backend/src/backend/gen_program_elf.cpp index feea392..0c78964 100644 --- a/backend/src/backend/gen_program_elf.cpp +++ b/backend/src/backend/gen_program_elf.cpp @@ -262,10 +262,12 @@ public: section *ker_info_sec; section *cl_info_sec; section *rodata_sec; + section *reloc_rodata_sec; symbol_section_accessor *syma; string_section_accessor *stra; note_section_accessor *note_writer; note_section_accessor *cl_note_writer; + relocation_section_accessor *rela; Elf32_Word sym_num; uint64_t bitcode_offset; @@ -280,8 +282,8 @@ public: GenProgramElfContext::GenProgramElfContext(GenProgram &prog) : genProg(prog), text_sec(NULL), sym_sec(NULL), strtab_sec(NULL), ker_info_sec(NULL), - cl_info_sec(NULL), rodata_sec(NULL), syma(NULL), stra(NULL), note_writer(NULL), - cl_note_writer(NULL), sym_num(0), bitcode_offset(0) + cl_info_sec(NULL), rodata_sec(NULL), 
reloc_rodata_sec(NULL), syma(NULL), stra(NULL), + note_writer(NULL), cl_note_writer(NULL), rela(NULL), sym_num(0), bitcode_offset(0) { writer.create(ELFCLASS64, ELFDATA2LSB); writer.set_os_abi(ELFOSABI_LINUX); @@ -339,6 +341,8 @@ GenProgramElfContext::~GenProgramElfContext(void) GBE_DELETE(note_writer); if (cl_note_writer) GBE_DELETE(cl_note_writer); + if (rela) + GBE_DELETE(rela); } /*Store the special vitrual register map */ @@ -653,6 +657,41 @@ GenProgram::toBinaryFormat(size_t &ret_size) getGlobalConstantData(const_data); elf_ctx->rodata_sec->set_data(const_data, getGlobalConstantSize()); GBE_FREE(const_data); + + if (getGlobalRelocCount() > 0) { + elf_ctx->reloc_rodata_sec = elf_ctx->writer.sections.add(".rel.rodata"); + elf_ctx->reloc_rodata_sec->set_type(SHT_RELA); + elf_ctx->reloc_rodata_sec->set_info(elf_ctx->rodata_sec->get_index()); + elf_ctx->reloc_rodata_sec->set_addr_align(0x4); + elf_ctx->reloc_rodata_sec->set_entry_size(elf_ctx->writer.get_default_entry_size(SHT_RELA)); + elf_ctx->reloc_rodata_sec->set_link(elf_ctx->sym_sec->get_index()); + elf_ctx->rela = GBE_NEW(relocation_section_accessor, elf_ctx->writer, elf_ctx->reloc_rodata_sec); + + char *reloc_data = static_cast<char *>(GBE_MALLOC(getGlobalRelocCount() * sizeof(ir::RelocEntry))); + getGlobalRelocTable(reloc_data); + ir::RelocEntry *rel_entry = reinterpret_cast<ir::RelocEntry *>(reloc_data); + std::sort(rel_entry, rel_entry + getGlobalRelocCount(), + [](ir::RelocEntry &a, ir::RelocEntry &b) { return a.defOffset < b.defOffset; }); + + std::string last_name; + unsigned int var_defOffset; + Elf_Word var_symbol; + for (uint32_t e = 0; e < getGlobalRelocCount(); e++) { + if (last_name != relocTable->getEntryName(rel_entry[e])) { + // Add a global symbol + var_defOffset = rel_entry[e].defOffset; + last_name = relocTable->getEntryName(rel_entry[e]); + assert(last_name != ""); // Must have a name + var_symbol = elf_ctx->syma->add_symbol(*elf_ctx->stra, last_name.c_str(), var_defOffset, + 
this->constantSet->getConstant(last_name).getSize(), + STB_GLOBAL, STT_OBJECT, 0, elf_ctx->rodata_sec->get_index()); + } + elf_ctx->rela->add_entry(rel_entry[e].refOffset, var_symbol, (unsigned char)R_386_32, + rel_entry[e].defOffset - var_defOffset); + } + + GBE_FREE(reloc_data); + } } /* Add the note about GPU info */ @@ -707,22 +746,17 @@ GenProgram::toBinaryFormat(size_t &ret_size) if (write_cl_version == false) { std::string ocl_version_str; - Elf32_Word cl_version[2]; // major and minor - - oclVersion = k->getOclVersion(); + oclVersion = k->getOclVersion(); // major and minor if (oclVersion == 120) { ocl_version_str = "OpenCL 1.2"; - cl_version[0] = 1; - cl_version[1] = 2; } else if (oclVersion == 200) { ocl_version_str = "OpenCL 2.0"; - cl_version[0] = 2; - cl_version[1] = 0; } else assert(0); elf_ctx->cl_note_writer->add_note(GenProgramElfContext::GEN_NOTE_TYPE_CL_VERSION, - ocl_version_str, cl_version, sizeof(cl_version)); + ocl_version_str, &oclVersion, sizeof(oclVersion)); + write_cl_version = true; } else { assert(oclVersion == k->getOclVersion()); } diff --git a/backend/src/ir/reloc.cpp b/backend/src/ir/reloc.cpp index 4884610..70dc0f6 100644 --- a/backend/src/ir/reloc.cpp +++ b/backend/src/ir/reloc.cpp @@ -67,7 +67,7 @@ namespace ir { for (uint32_t i = 0; i < sz; i++) { IN_UPDATE_SZ(refOffset); IN_UPDATE_SZ(defOffset); - addEntry(refOffset, defOffset); + addEntry(refOffset, defOffset, NULL); } IN_UPDATE_SZ(magic); diff --git a/backend/src/ir/reloc.hpp b/backend/src/ir/reloc.hpp index de33a8a..27cc943 100644 --- a/backend/src/ir/reloc.hpp +++ b/backend/src/ir/reloc.hpp @@ -27,6 +27,7 @@ #include "sys/vector.hpp" #include <string.h> +#include <map> namespace gbe { namespace ir { @@ -42,17 +43,31 @@ namespace ir { unsigned int refOffset; unsigned int defOffset; + friend bool operator< (const RelocEntry& a, const RelocEntry& b) { + if (a.defOffset < b.defOffset) + return true; + if (a.refOffset < b.refOffset) + return true; + return false; + } }; class 
RelocTable : public NonCopyable, public Serializable { public: - void addEntry(unsigned refOffset, unsigned defOffset) { + void addEntry(unsigned refOffset, unsigned defOffset, const char *name) { entries.push_back(RelocEntry(refOffset, defOffset)); + RelocEntry& re = entries.back(); + entryNames[re] = name; + } + std::string getEntryName(RelocEntry& re) { + if (entryNames.find(re) == entryNames.end()) + return std::string(); + return entryNames[re]; } RelocTable() : Serializable() {} - RelocTable(const RelocTable& other) : Serializable(other), - entries(other.entries) {} + RelocTable(const RelocTable& other) : + Serializable(other), entries(other.entries), entryNames(other.entryNames) {} uint32_t getCount() { return entries.size(); } void getData(char *p) { if (entries.size() > 0 && p) @@ -80,6 +95,7 @@ namespace ir { virtual uint32_t deserializeFromBin(std::istream& ins); private: vector<RelocEntry> entries; + std::map<RelocEntry, std::string> entryNames; GBE_CLASS(RelocTable); }; diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 3fefa92..7b07d8d 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -1691,8 +1691,7 @@ namespace gbe for (uint32_t k = 0; k < relocs.size(); k++) { unit.getRelocTable().addEntry( refOffset + relocs[k].refOffset, - relocs[k].defOffset - ); + relocs[k].defOffset, name); } } } diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h index 01aa0f3..8dfa52f 100644 --- a/src/cl_gen7_device.h +++ b/src/cl_gen7_device.h @@ -30,5 +30,5 @@ //temporarily define to only export builtin kernel block_motion_estimate_intel only for Gen7 //will remove after HSW+ also support #define GEN7_DEVICE -#include "cl_gt_device.h" +#include "cl_gen_device_common.h" #undef GEN7_DEVICE diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c index 8bbfe2c..1f3e1c1 100644 --- a/src/gen/cl_command_queue_gen.c +++ b/src/gen/cl_command_queue_gen.c @@ -67,7 
+67,6 @@ typedef struct gen_gpgpu { drm_intel_bo *scratch_bo; /* Scratch buffer */ drm_intel_bo *const_bo; /* Constant buffer */ - drm_intel_bo *stack_bo; /* stack buffer */ drm_intel_bo *time_stamp_bo; /* The buffer to record exec timestamps */ @@ -267,12 +266,18 @@ gen_gpgpu_setup_global_mem(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu int32_t offset = 0; cl_mem mem; uint32_t bti; + cl_program_gen prog_gen; + DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen); for (i = 0; i < kernel->arg_n; i++) { if (kernel->args[i].arg_type != ArgTypePointer) continue; - if (kernel->args[i].arg_addrspace != AddressSpaceGlobal) + if (kernel->args[i].arg_addrspace != AddressSpaceGlobal && + kernel->args[i].arg_addrspace != AddressSpaceConstant) + continue; + + if (prog_gen->cl_version < 200 && kernel->args[i].arg_addrspace == AddressSpaceConstant) continue; mem = NULL; @@ -372,6 +377,44 @@ gen_gpgpu_setup_scratch(gen_gpgpu *gpu) } static cl_int +gen_setup_constant_buffer_for_20(cl_kernel kernel, cl_kernel_gen kernel_gen, + cl_program_gen prog_gen, gen_gpgpu *gpu) +{ +#ifndef HAS_BO_SET_SOFTPIN + return CL_OUT_OF_RESOURCES; +#else + int i; + cl_bool need_const_buf = CL_FALSE; + cl_int const_addr_curbe_offset = -1; + cl_gen_virt_phy_offset map = kernel_gen->virt_reg_phy_offset; + + for (i = 0; i < kernel_gen->virt_reg_phy_offset_num; i++) { + if (map[i].virt_reg == GBE_CURBE_CONSTANT_ADDRSPACE) { + need_const_buf = CL_TRUE; + const_addr_curbe_offset = map[i].phy_offset; + assert(map[i].size == 8); + break; + } + } + + if (need_const_buf == CL_FALSE) + return CL_SUCCESS; + + assert(prog_gen->global_mem_data); // Should always have something + assert(const_addr_curbe_offset >= 0); + + gpu->mem.const_bo = intel_buffer_alloc_userptr(gpu->bufmgr, "program global data", + prog_gen->global_mem_data, prog_gen->global_mem_data_size, 0); + drm_intel_bo_set_softpin_offset(gpu->mem.const_bo, (size_t)prog_gen->global_mem_data); + 
drm_intel_bo_use_48b_address_range(gpu->mem.const_bo, 1); + *(char **)(gpu->thread.curbe + const_addr_curbe_offset) = prog_gen->global_mem_data; + gen_gpgpu_bind_one_bo(gpu, gpu->mem.const_bo, const_addr_curbe_offset, 0, + prog_gen->global_mem_data_size, BTI_CONSTANT); + return CL_SUCCESS; +#endif +} + +static cl_int gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu) { cl_program_gen prog_gen; @@ -383,6 +426,11 @@ gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu int i; DEV_PRIVATE_DATA(kernel->program, gpu->device, prog_gen); + /* 2.0 is different from before */ + if (prog_gen->cl_version >= 200) { + return gen_setup_constant_buffer_for_20(kernel, kernel_gen, prog_gen, gpu); + } + if (prog_gen->rodata) { const_buf_size = prog_gen->rodata_data->d_size; aligned_const_buf_size = ALIGN(const_buf_size, 8); diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h index d04a644..0f50e37 100644 --- a/src/gen/cl_gen.h +++ b/src/gen/cl_gen.h @@ -182,6 +182,14 @@ extern cl_int cl_kernel_get_info_gen(cl_device_id device, cl_kernel kernel, extern cl_int cl_kernel_create_gen(cl_device_id device, cl_kernel kernel); /*********************************** Program *****************************************/ +enum cl_gen_program_note_type { + GEN_NOTE_TYPE_CL_VERSION = 1, + GEN_NOTE_TYPE_GPU_VERSION = 2, + GEN_NOTE_TYPE_GPU_INFO = 3, + GEN_NOTE_TYPE_CL_INFO = 4, + GEN_NOTE_TYPE_COMPILER_INFO = 5, +}; + typedef struct _cl_program_gen { _cl_program_for_device prog_base; Elf *elf; @@ -205,9 +213,26 @@ typedef struct _cl_program_gen { Elf_Scn *func_cl_info; cl_int func_cl_info_sec_index; Elf_Data *func_cl_info_data; + Elf_Scn *ro_reloc; + cl_int ro_reloc_index; + Elf_Data *ro_reloc_data; + char *global_mem_data; + cl_uint global_mem_data_size; + char *gpu_name; + cl_uint gpu_version_major; + cl_uint gpu_version_minor; + char *compiler_name; + cl_uint compiler_version_major; + cl_uint compiler_version_minor; + char *cl_version_str; + 
cl_uint cl_version; } _cl_program_gen; typedef _cl_program_gen *cl_program_gen; +#define GEN_ELF_RELOC_GET_SYM(PROG_GEN, RELOC_ENTRY) \ + gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_SYM(RELOC_ENTRY->r_info) : ELF32_R_SYM(RELOC_ENTRY->r_info) +#define GEN_ELF_RELOC_GET_TYPE(PROG_GEN, RELOC_ENTRY) \ + gelf_getclass(PROG_GEN->elf) == ELFCLASS64 ? ELF64_R_TYPE(RELOC_ENTRY->r_info) : ELF32_R_TYPE(RELOC_ENTRY->r_info) extern void *cl_program_new_gen(cl_device_id device, cl_program p); extern void cl_program_delete_gen(cl_device_id device, cl_program p); extern cl_int cl_program_load_binary_gen(cl_device_id device, cl_program prog); diff --git a/src/gen/cl_gen75_device.h b/src/gen/cl_gen75_device.h index 99b76bf..0d6c812 100644 --- a/src/gen/cl_gen75_device.h +++ b/src/gen/cl_gen75_device.h @@ -21,7 +21,7 @@ .max_parameter_size = 1024, .global_mem_cache_line_size = 64, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.local_mem_type = CL_GLOBAL, +.local_mem_type = CL_LOCAL, .local_mem_size = 64 << 10, .scratch_mem_size = 2 << 20, .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul, diff --git a/src/gen/cl_gen7_device.h b/src/gen/cl_gen7_device.h index 7bf1202..8dfa52f 100644 --- a/src/gen/cl_gen7_device.h +++ b/src/gen/cl_gen7_device.h @@ -21,7 +21,7 @@ .max_parameter_size = 1024, .global_mem_cache_line_size = 64, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.local_mem_type = CL_GLOBAL, +.local_mem_type = CL_LOCAL, .local_mem_size = 64 << 10, .scratch_mem_size = 12 << 10, .max_mem_alloc_size = 2 * 1024 * 1024 * 1024ul, diff --git a/src/gen/cl_gen8_device.h b/src/gen/cl_gen8_device.h index b807272..c8b7754 100644 --- a/src/gen/cl_gen8_device.h +++ b/src/gen/cl_gen8_device.h @@ -21,7 +21,7 @@ .max_parameter_size = 1024, .global_mem_cache_line_size = 64, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.local_mem_type = CL_GLOBAL, +.local_mem_type = CL_LOCAL, .local_mem_size = 64 << 10, .scratch_mem_size = 2 << 20, .max_mem_alloc_size = 2 * 1024 * 
1024 * 1024ul, diff --git a/src/gen/cl_gen9_device.h b/src/gen/cl_gen9_device.h index d069332..7412e98 100644 --- a/src/gen/cl_gen9_device.h +++ b/src/gen/cl_gen9_device.h @@ -21,10 +21,13 @@ .max_parameter_size = 1024, .global_mem_cache_line_size = 64, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.local_mem_type = CL_GLOBAL, +.local_mem_type = CL_LOCAL, .local_mem_size = 64 << 10, .scratch_mem_size = 2 << 20, .max_mem_alloc_size = 4 * 1024 * 1024 * 1024ul, .global_mem_size = 4 * 1024 * 1024 * 1024ul, +#define GEN9_DEVICE 1 #include "cl_gen_device_common.h" +#undef GEN9_DEVICE + diff --git a/src/gen/cl_gen_device_common.h b/src/gen/cl_gen_device_common.h index 9fef422..16b4811 100644 --- a/src/gen/cl_gen_device_common.h +++ b/src/gen/cl_gen_device_common.h @@ -49,11 +49,7 @@ .native_vector_width_float = 4, .native_vector_width_double = 2, .native_vector_width_half = 8, -#ifdef ENABLE_OPENCL_20 -.address_bits = 64, -#else .address_bits = 32, -#endif .svm_capabilities = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER, .preferred_platform_atomic_alignment = 0, .preferred_global_atomic_alignment = 0, diff --git a/src/gen/cl_kernel_gen.c b/src/gen/cl_kernel_gen.c index 4e85c1d..0fd5809 100644 --- a/src/gen/cl_kernel_gen.c +++ b/src/gen/cl_kernel_gen.c @@ -199,7 +199,7 @@ cl_program_gen_get_kernel_func_cl_info(cl_device_id device, cl_kernel kernel) desc_type = *(cl_uint *)(prog_gen->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint)); name = prog_gen->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3; - if (desc_type != 0x04) { + if (desc_type != GEN_NOTE_TYPE_CL_INFO) { offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4); continue; } @@ -355,7 +355,7 @@ cl_program_gen_get_one_kernel_func(cl_device_id device, cl_kernel kernel, GElf_S desc_size = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint)); desc_type = *(cl_uint *)(prog_gen->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint)); name = 
prog_gen->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3; - if (desc_type != 0x03) { + if (desc_type != GEN_NOTE_TYPE_GPU_INFO) { offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4); continue; } @@ -535,9 +535,9 @@ cl_kernel_create_gen(cl_device_id device, cl_kernel kernel) for (i = 0; i < (int)(prog_gen->symtab_entry_num); i++) { p_sym_entry = gelf_getsym(prog_gen->symtab_data, i, &sym_entry); assert(p_sym_entry == &sym_entry); - if ((p_sym_entry->st_info & 0x0f) != STT_FUNC) + if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC) continue; - if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL) + if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL) continue; name = p_sym_entry->st_name + prog_gen->strtab_data->d_buf; diff --git a/src/gen/cl_program_gen.c b/src/gen/cl_program_gen.c index 3c0b796..3b2f4b5 100644 --- a/src/gen/cl_program_gen.c +++ b/src/gen/cl_program_gen.c @@ -17,6 +17,7 @@ */ #include "cl_gen.h" +#include <unistd.h> struct binary_type_header_info { unsigned char header[7]; @@ -94,6 +95,24 @@ cl_program_delete_gen(cl_device_id device, cl_program p) } pd->kernel_names = NULL; + if (gen_elf->compiler_name) + CL_FREE(gen_elf->compiler_name); + gen_elf->compiler_name = NULL; + + if (gen_elf->gpu_name) + CL_FREE(gen_elf->gpu_name); + gen_elf->gpu_name = NULL; + + if (gen_elf->cl_version_str) + CL_FREE(gen_elf->cl_version_str); + gen_elf->cl_version_str = NULL; + + if (gen_elf->global_mem_data) { + CL_FREE(gen_elf->global_mem_data); + assert(gen_elf->global_mem_data_size > 0); + } + gen_elf->global_mem_data = NULL; + if (gen_elf->elf) elf_end(gen_elf->elf); gen_elf->elf = NULL; @@ -102,6 +121,69 @@ cl_program_delete_gen(cl_device_id device, cl_program p) } static cl_int +cl_program_gen_alloc_global_mem(cl_device_id device, cl_program prog, cl_program_gen prog_gen) +{ + int i; + cl_uint const_buf_size = 0; + cl_uint aligned_const_buf_size = 0; + + if (prog_gen->cl_version < 200 && prog_gen->rodata_data != NULL) + return 
CL_INVALID_PROGRAM; + + if (prog_gen->cl_version < 200 || prog_gen->rodata_data == NULL) + return CL_SUCCESS; + + const_buf_size = prog_gen->rodata_data->d_size; + aligned_const_buf_size = ALIGN(const_buf_size, getpagesize()); + prog_gen->global_mem_data = CL_MEMALIGN(getpagesize(), aligned_const_buf_size); + if (prog_gen->global_mem_data == NULL) + return CL_OUT_OF_RESOURCES; + + prog_gen->global_mem_data_size = aligned_const_buf_size; + memset(prog_gen->global_mem_data, 0, aligned_const_buf_size); + memcpy(prog_gen->global_mem_data, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size); + + /* Do some reloc setting */ + if (prog_gen->ro_reloc) { + GElf_Rela entry; + GElf_Rela *p_entry; + cl_int ro_reloc_num; + GElf_Shdr *p_sec_header = NULL; + GElf_Shdr sec_header; + GElf_Sym *p_sym_entry; + GElf_Sym sym_entry; + char *const_buf_addr = prog_gen->global_mem_data; + assert(prog_gen->ro_reloc_data); + + p_sec_header = gelf_getshdr(prog_gen->ro_reloc, &sec_header); + ro_reloc_num = p_sec_header->sh_size / p_sec_header->sh_entsize; + for (i = 0; i < ro_reloc_num; i++) { + p_entry = gelf_getrela(prog_gen->ro_reloc_data, i, &entry); + if (p_entry == NULL) { + return CL_INVALID_PROGRAM; + } + + if ((cl_uint)(GEN_ELF_RELOC_GET_TYPE(prog_gen, p_entry)) != R_386_32) { + return CL_INVALID_PROGRAM; + } + + p_sym_entry = gelf_getsym(prog_gen->symtab_data, + GEN_ELF_RELOC_GET_SYM(prog_gen, p_entry), &sym_entry); + if (p_sym_entry == NULL) { + return CL_INVALID_PROGRAM; + } + + assert(p_entry->r_offset > 0); + assert(sizeof(void *) == 8); // Must be 64 bits + *(char **)(const_buf_addr + p_entry->r_offset) = + (char *)(const_buf_addr + p_sym_entry->st_value + p_entry->r_addend); + } + } + + return CL_SUCCESS; +} + +static cl_int cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) { cl_program_for_device pd; @@ -115,9 +197,13 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) GElf_Sym *p_sym_entry; GElf_Sym sym_entry; char *name; - int 
ret; size_t val = 0; int i, j; + cl_int offset; + cl_uint name_size; + cl_uint desc_size; + cl_uint desc_type; + cl_int ret; DEV_PRIVATE_DATA(prog, device, elf); pd = &elf->prog_base; @@ -191,11 +277,14 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) } else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rodata") == 0) { elf->rodata = elf_sec; elf->rodata_sec_index = i; + } else if (strcmp(sh_strtab_data->d_buf + p_sec_header->sh_name, ".rel.rodata") == 0) { + elf->ro_reloc = elf_sec; + elf->ro_reloc_index = i; } } - if (elf->text == NULL || elf->symtab == NULL || - elf->strtab == NULL || elf->func_gpu_info == NULL) { + if (elf->text == NULL || elf->symtab == NULL || elf->strtab == NULL || + elf->func_gpu_info == NULL || elf->func_cl_info == NULL) { elf_end(elf_p); elf->elf = NULL; return CL_INVALID_PROGRAM; @@ -213,13 +302,15 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) assert(p_sec_header->sh_size % p_sec_header->sh_entsize == 0); elf->func_gpu_info_data = elf_getdata(elf->func_gpu_info, NULL); assert(elf->func_gpu_info_data); + elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL); + assert(elf->func_cl_info_data); if (elf->rodata) { elf->rodata_data = elf_getdata(elf->rodata, NULL); assert(elf->rodata_data); } - if (elf->func_cl_info) { - elf->func_cl_info_data = elf_getdata(elf->func_cl_info, NULL); - assert(elf->func_cl_info_data); + if (elf->ro_reloc) { + elf->ro_reloc_data = elf_getdata(elf->ro_reloc, NULL); + assert(elf->ro_reloc_data); } /* Add all kernel names */ @@ -228,9 +319,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) for (i = 0; i < (int)(elf->symtab_entry_num); i++) { p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry); assert(p_sym_entry == &sym_entry); - if ((p_sym_entry->st_info & 0x0f) != STT_FUNC) + if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC) continue; - if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL) + if 
(ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL) continue; name = p_sym_entry->st_name + elf->strtab_data->d_buf; @@ -254,9 +345,9 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) for (i = 0; i < (int)(elf->symtab_entry_num); i++) { p_sym_entry = gelf_getsym(elf->symtab_data, i, &sym_entry); assert(p_sym_entry == &sym_entry); - if ((p_sym_entry->st_info & 0x0f) != STT_FUNC) + if (ELF32_ST_TYPE(p_sym_entry->st_info) != STT_FUNC) continue; - if (((p_sym_entry->st_info & 0x0f0) >> 4) != STB_GLOBAL) + if (ELF32_ST_BIND(p_sym_entry->st_info) != STB_GLOBAL) continue; pd->kernel_names[j] = @@ -273,7 +364,72 @@ cl_program_load_binary_gen_elf(cl_device_id device, cl_program prog) } assert(j == pd->kernel_num); - return CL_SUCCESS; + /* Get the compiler name and gpu version */ + offset = 0; + while (offset < elf->func_gpu_info_data->d_size) { + name_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset); + desc_size = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint)); + desc_type = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + 2 * sizeof(cl_uint)); + if (desc_type == GEN_NOTE_TYPE_COMPILER_INFO) { + elf->compiler_name = CL_CALLOC(name_size + 1, sizeof(char)); + if (elf->compiler_name == NULL) { + elf_end(elf_p); + elf->elf = NULL; + return CL_OUT_OF_HOST_MEMORY; + } + memcpy(elf->compiler_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size); + elf->compiler_name[name_size] = 0; + elf->compiler_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + + 3 * sizeof(cl_uint) + ALIGN(name_size, 4)); + elf->compiler_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + + 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint)); + } else if (desc_type == GEN_NOTE_TYPE_GPU_VERSION) { + elf->gpu_name = CL_CALLOC(name_size + 1, sizeof(char)); + if (elf->gpu_name == NULL) { + elf_end(elf_p); + elf->elf = NULL; + return CL_OUT_OF_HOST_MEMORY; + } + 
memcpy(elf->gpu_name, elf->func_gpu_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size); + elf->gpu_name[name_size] = 0; + elf->gpu_version_major = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + + 3 * sizeof(cl_uint) + ALIGN(name_size, 4)); + elf->gpu_version_minor = *(cl_uint *)(elf->func_gpu_info_data->d_buf + offset + + 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + sizeof(cl_uint)); + } + + offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4); + } + + /* Get the OpenCL version */ + offset = 0; + while (offset < elf->func_cl_info_data->d_size) { + name_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset); + desc_size = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint)); + desc_type = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + 2 * sizeof(cl_uint)); + if (desc_type == GEN_NOTE_TYPE_CL_VERSION) { + elf->cl_version_str = CL_CALLOC(name_size + 1, sizeof(char)); + if (elf->cl_version_str == NULL) { + elf_end(elf_p); + elf->elf = NULL; + return CL_OUT_OF_HOST_MEMORY; + } + memcpy(elf->cl_version_str, elf->func_cl_info_data->d_buf + offset + sizeof(cl_uint) * 3, name_size); + elf->cl_version_str[name_size] = 0; + elf->cl_version = *(cl_uint *)(elf->func_cl_info_data->d_buf + offset + + 3 * sizeof(cl_uint) + ALIGN(name_size, 4)); + } + + offset += 3 * sizeof(cl_uint) + ALIGN(name_size, 4) + ALIGN(desc_size, 4); + } + + ret = cl_program_gen_alloc_global_mem(device, prog, elf); + if (ret != CL_SUCCESS) { + elf_end(elf_p); + elf->elf = NULL; + } + + return ret; } LOCAL cl_int diff --git a/src/gen/intel_driver.c b/src/gen/intel_driver.c index 2f62b22..eac6366 100644 --- a/src/gen/intel_driver.c +++ b/src/gen/intel_driver.c @@ -560,8 +560,8 @@ intel_buffer_set_tiling(cl_buffer bo, cl_image_tiling_t tiling, size_t stride) return ret; } -static cl_buffer -intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name, +LOCAL drm_intel_bo* +intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name, 
void *data, size_t size, unsigned long flags) { #ifdef HAS_USERPTR @@ -572,7 +572,7 @@ intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char *name, if (bo == NULL) bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED); - return (cl_buffer)bo; + return bo; #else return NULL; #endif @@ -912,12 +912,11 @@ intel_update_device_info(cl_device_id device) host_ptr = CL_MEMALIGN(sz, 4096); if (host_ptr != NULL) { - cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr, - "CL memory object", host_ptr, sz, 0); + drm_intel_bo* bo = intel_buffer_alloc_userptr(driver->bufmgr, "CL memory object", host_ptr, sz, 0); if (bo == NULL) device->host_unified_memory = CL_FALSE; else - drm_intel_bo_unreference((drm_intel_bo *)bo); + drm_intel_bo_unreference(bo); CL_FREE(host_ptr); } else device->host_unified_memory = CL_FALSE; diff --git a/src/gen/intel_driver.h b/src/gen/intel_driver.h index 825eebf..d01cd55 100644 --- a/src/gen/intel_driver.h +++ b/src/gen/intel_driver.h @@ -132,6 +132,8 @@ extern int intel_get_device_id(void); /* methods working in shared mode */ extern dri_bo *intel_driver_share_buffer(intel_driver_t *, const char *sname, uint32_t name); extern uint32_t intel_driver_shared_name(intel_driver_t *, dri_bo *); +extern dri_bo* intel_buffer_alloc_userptr(dri_bufmgr *bufmgr, const char *name, + void *data, size_t size, unsigned long flags); /* init the call backs used by the ocl driver */ extern void intel_setup_callbacks(void); -- 2.7.4 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet