From: Ruiling Song <[email protected]> To avoid a zero address in local memory, 4 bytes are reserved at the start of local memory as a temporary workaround. This will be fixed later.
Signed-off-by: Ruiling Song <[email protected]> --- backend/src/backend/program.h | 1 + backend/src/ir/profile.cpp | 4 +++- backend/src/ir/profile.hpp | 3 ++- backend/src/libocl/include/ocl_misc.h | 6 ++++++ backend/src/libocl/src/ocl_misc.cl | 21 +++++++++++++++++++++ backend/src/llvm/llvm_gen_backend.cpp | 22 ++++++++++++++++++++++ backend/src/llvm/llvm_gen_ocl_function.hxx | 1 + src/cl_command_queue_gen7.c | 12 +++++++++--- 8 files changed, 65 insertions(+), 5 deletions(-) diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index 03150bc..0eece8f 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -100,6 +100,7 @@ enum gbe_curbe_type { GBE_CURBE_DW_BLOCK_IP, GBE_CURBE_THREAD_NUM, GBE_CURBE_CONSTANT_ADDRSPACE, + GBE_CURBE_STACK_SIZE, GBE_GEN_REG, }; diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 0699167..4f28e34 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -44,7 +44,8 @@ namespace ir { "retVal", "printf_buffer_pointer", "printf_index_buffer_pointer", "dwblockip", - "constant_addrspace_start" + "constant_addrspace_start", + "stack_size" }; #if GBE_DEBUG @@ -88,6 +89,7 @@ namespace ir { DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER); DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP); DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE); + DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 79761d4..f348e0d 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -72,7 +72,8 @@ namespace ir { static const Register printfiptr = Register(28); // printf index buffer address. 
static const Register dwblockip = Register(29); // blockip static const Register constant_addrspace = Register(30); // starting address of program-scope constant - static const uint32_t regNum = 31; // number of special registers + static const Register stacksize = Register(31); // stack buffer total size + static const uint32_t regNum = 32; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h index 359025b..dba821d 100644 --- a/backend/src/libocl/include/ocl_misc.h +++ b/backend/src/libocl/include/ocl_misc.h @@ -137,4 +137,10 @@ struct time_stamp { }; struct time_stamp __gen_ocl_get_timestamp(void); +bool __gen_ocl_in_local(size_t p); +bool __gen_ocl_in_private(size_t p); + +local void *to_local(generic void *p); +global void *to_global(generic void *p); +private void *to_private(generic void *p); #endif diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl index 7f40054..232534d 100644 --- a/backend/src/libocl/src/ocl_misc.cl +++ b/backend/src/libocl/src/ocl_misc.cl @@ -229,3 +229,24 @@ struct time_stamp __gen_ocl_get_timestamp(void) { return val; }; +bool __gen_ocl_in_local(size_t p) { + bool cond1 = p > 0; + bool cond2 = p < 64*1024; + return cond1 && cond2; +} + +local void *to_local(generic void *p) { + bool cond = __gen_ocl_in_local((size_t)p); + return cond ? (local void*)p : NULL; +} +private void *to_private(generic void *p) { + bool cond = __gen_ocl_in_private((size_t)p); + return cond ? (private void*)p : NULL; +} + +global void *to_global(generic void *p) { + bool cond1 = __gen_ocl_in_local((size_t)p); + bool cond2 = __gen_ocl_in_private((size_t)p); + bool cond = cond1 || cond2; + return !cond ? 
(global void*)p : NULL; +} diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index d23a598..2e0bedc 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2580,6 +2580,8 @@ namespace gbe const Constant *c = v.getInitializer(); Type *ty = c->getType(); uint32_t oldSlm = f.getSLMSize(); + // FIXME temporary reserve 4 bytes to avoid 0 address + if (oldSlm == 0) oldSlm = 4; uint32_t align = 8 * getAlignmentByte(unit, ty); uint32_t padding = getPadding(oldSlm*8, align); @@ -3604,6 +3606,7 @@ namespace gbe case GEN_OCL_SIMD_SIZE: case GEN_OCL_READ_TM: case GEN_OCL_REGION: + case GEN_OCL_IN_PRIVATE: case GEN_OCL_SIMD_ID: case GEN_OCL_SIMD_SHUFFLE: case GEN_OCL_WORK_GROUP_ALL: @@ -3969,6 +3972,25 @@ namespace gbe ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM); break; } + case GEN_OCL_IN_PRIVATE: + { + const ir::Register dst = this->getRegister(&I); + uint32_t stackSize = ctx.getFunction().getStackSize(); + if (stackSize == 0) { + ctx.MOV(ir::TYPE_BOOL, dst, ir::ocl::zero); + } else { + ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL); + ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL); + const ir::Register src0 = this->getRegister(*AI); + ir::Register tmp = ctx.reg(ir::FAMILY_QWORD); + + ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer); + ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize); + ctx.LT(ir::TYPE_U64, cmp1, src0, tmp); + ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1); + } + break; + } case GEN_OCL_REGION: { const ir::Register dst = this->getRegister(&I); diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 0849f1e..92d4ea3 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -167,6 +167,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle) DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm) DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region) 
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private) // printf function DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf) diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 61ffe7e..eba3445 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -263,7 +263,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) const int32_t per_lane_stack_sz = ker->stack_size; const int32_t value = GBE_CURBE_EXTRA_ARGUMENT; const int32_t sub_value = GBE_STACK_BUFFER; - const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value); + const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value); int32_t stack_sz = per_lane_stack_sz; /* No stack required for this kernel */ @@ -273,7 +273,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) /* The stack size is given for *each* SIMD lane. So, we accordingly compute * the size we need for the complete machine */ - assert(offset >= 0); + assert(offset_stack_buffer >= 0); stack_sz *= interp_kernel_get_simd_width(ker->opaque); stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit; /* Because HSW calc stack offset per thread is relative with half slice, when @@ -282,7 +282,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) */ if(cl_driver_get_ver(ctx->drv) == 75) stack_sz *= 4; - cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE); + + const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0); + if (offset_stack_size >= 0) { + *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz; + } + + cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE); } static int -- 2.4.1 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
