For the comment part, I will refine it. About function name, I also wanted to use intel_gpgpu_alloc_constant_buffer as beignet use constant_buffer at many places which refer to the curbe buffer. so, I hesitated to use alloc_constant_buffer. I also noticed the function name is too long :(
What about changing existing " cl_gpgpu_upload_constants " " intel_gpgpu_load_constant_buffer " into "cl_gpgpu_upload_curbes" " intel_gpgpu_load_curbe_buffer " If it is OK, I will change it, so that the symbols do not introduce misunderstanding. Thanks! Ruiling -----Original Message----- From: Zhigang Gong [mailto:[email protected]] Sent: Wednesday, September 04, 2013 10:42 AM To: Song, Ruiling Cc: [email protected] Subject: Re: [Beignet] [PATCH 2/3] Implement constant buffer based on constant cache. Nice patch, some minor comments as below. On Tue, Sep 03, 2013 at 03:42:36PM +0800, Ruiling Song wrote: > Currently, simply allocate enough graphics memory as constant memory space. > And bind it to bti 2. Constant cache read are backed by dword scatter read. > Different from other data port messages, the address need to be dword > aligned, and the addresses are in units of dword. > > The constant address space data are placed in order: first global > constant, then the constant buffer kernel argument. > > Signed-off-by: Ruiling Song <[email protected]> > --- > backend/src/backend/context.cpp | 12 +---- > backend/src/backend/gen_insn_selection.cpp | 30 +++++++++++- > backend/src/backend/gen_reg_allocation.cpp | 1 - > backend/src/backend/program.h | 2 - > backend/src/ir/profile.cpp | 2 - > backend/src/ir/profile.hpp | 5 +- > backend/src/llvm/llvm_gen_backend.cpp | 9 +--- > src/cl_command_queue.c | 18 -------- > src/cl_command_queue.h | 3 -- > src/cl_command_queue_gen7.c | 69 > ++++++++++++++++++++++++---- > src/cl_driver.h | 3 ++ > src/cl_driver_defs.c | 1 + > src/cl_gt_device.h | 2 +- > src/cl_kernel.c | 10 ---- > src/intel/intel_driver.c | 2 +- > src/intel/intel_gpgpu.c | 42 ++++++++++++++++- > 16 files changed, 138 insertions(+), 73 deletions(-) > > diff --git a/backend/src/backend/context.cpp > b/backend/src/backend/context.cpp index 5484869..ac3a243 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -458,15 +458,6 @@ namespace gbe > } > }); > #undef INSERT_REG > - this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int)); > - specialRegs.insert(ir::ocl::constoffst); > - > - // Insert serialized global constant arrays if used > - const ir::ConstantSet& constantSet = unit.getConstantSet(); > - if (constantSet.getConstantNum()) { > - size_t size = constantSet.getDataSize(); > - this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size); > - } > > // Insert the number of threads > this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)); > @@ -640,8 +631,7 @@ namespace gbe > reg == ir::ocl::goffset0 || > reg == ir::ocl::goffset1 || > reg == ir::ocl::goffset2 || > - reg == ir::ocl::workdim || > - reg == ir::ocl::constoffst) > + reg == ir::ocl::workdim) > return true; > return false; > } > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 1e81dac..f6f28ff 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -2101,6 +2101,23 @@ namespace gbe > sel.UNTYPED_READ(addr, dst.data(), valueNum, bti); > } > > + void emitDWordGather(Selection::Opaque &sel, > + const ir::LoadInstruction &insn, > + GenRegister addr, > + uint32_t bti) const > + { > + using namespace ir; > + const uint32_t valueNum = insn.getValueNum(); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GBE_ASSERT(valueNum == 1); > + GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), > GEN_TYPE_F); > + // get dword based address > + GenRegister addrDW = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > + sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), > + GenRegister::immud(2)); > + > + sel.DWORD_GATHER(dst, addrDW, bti); > + } > + > void emitRead64(Selection::Opaque &sel, > const ir::LoadInstruction &insn, > GenRegister addr, @@ -2171,8 +2188,17 @@ > namespace gbe > GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false); > const Type type = insn.getValueType(); > const uint32_t elemSize = getByteScatterGatherSize(type); > - if (insn.getAddressSpace() == MEM_CONSTANT) > - this->emitIndirectMove(sel, insn, address); > + if (insn.getAddressSpace() == MEM_CONSTANT) { > + // XXX read 64bit constant through constant cache It's better to add TODO in the above comment. And if you can point out: why for the short/char data type, it can't be accessed from constant cache, it will be a little bit clearer. > + if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) > + this->emitRead64(sel, insn, address, 0x2); > + else if(insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_DWORD) > + this->emitDWordGather(sel, insn, address, 0x2); > + else { > + const GenRegister value = sel.selReg(insn.getValue(0)); > + this->emitByteGather(sel, insn, elemSize, address, value, 0x2); > + } > + } > else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_QWORD) > this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : > 0x00); > else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_DWORD) diff --git > a/backend/src/backend/gen_reg_allocation.cpp > b/backend/src/backend/gen_reg_allocation.cpp > index 0bb75a2..2abfb12 100644 > --- a/backend/src/backend/gen_reg_allocation.cpp > +++ b/backend/src/backend/gen_reg_allocation.cpp > @@ -573,7 +573,6 @@ namespace gbe > allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2); > allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr); > allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn); > - allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst); > > // Group and barrier IDs are always allocated by the hardware in r0 > RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // > r0.1 diff --git a/backend/src/backend/program.h > b/backend/src/backend/program.h index d20e7af..ff4d157 100644 > --- a/backend/src/backend/program.h > +++ b/backend/src/backend/program.h > @@ -70,8 +70,6 @@ enum gbe_curbe_type { > GBE_CURBE_GROUP_NUM_Y, > GBE_CURBE_GROUP_NUM_Z, > GBE_CURBE_WORK_DIM, > - GBE_CURBE_GLOBAL_CONSTANT_OFFSET, > - GBE_CURBE_GLOBAL_CONSTANT_DATA, > GBE_CURBE_IMAGE_INFO, > GBE_CURBE_STACK_POINTER, > GBE_CURBE_KERNEL_ARGUMENT, > diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp > index 675018a..927e43d 100644 > --- a/backend/src/ir/profile.cpp > +++ b/backend/src/ir/profile.cpp > @@ -40,7 +40,6 @@ namespace ir { > "stack_pointer", > "block_ip", > "barrier_id", "thread_number", > - "const_curbe_offset", > "work_dimension", > }; > > @@ -76,7 +75,6 @@ namespace ir { > DECL_NEW_REG(FAMILY_WORD, blockip); > DECL_NEW_REG(FAMILY_DWORD, barrierid); > DECL_NEW_REG(FAMILY_DWORD, threadn); > - DECL_NEW_REG(FAMILY_DWORD, constoffst); > DECL_NEW_REG(FAMILY_DWORD, workdim); > } > #undef DECL_NEW_REG > diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp > index 4b0ef5e..c79bc3b 100644 > --- a/backend/src/ir/profile.hpp > +++ b/backend/src/ir/profile.hpp > @@ -63,9 +63,8 @@ namespace ir { > static const Register blockip = Register(19); // blockip > static const Register barrierid = Register(20);// barrierid > static const Register threadn = Register(21); // number of threads > - static const Register constoffst = Register(22); // offset of global > constant array's curbe > - static const Register workdim = Register(23); // work dimention. > - static const uint32_t regNum = 24; // number of special > registers > + static const Register workdim = Register(22); // work dimention. > + static const uint32_t regNum = 23; // number of special > registers > extern const char *specialRegMean[]; // special register name. > } /* namespace ocl */ > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 12d809d..e747d00 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -1243,12 +1243,7 @@ namespace gbe > ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD); > ir::Constant &con = unit.getConstantSet().getConstant(j ++); > con.setReg(reg.value()); > - if(con.getOffset() != 0) { > - ctx.LOADI(ir::TYPE_S32, reg, > ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); > - ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg); > - } else { > - ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst); > - } > + ctx.LOADI(ir::TYPE_S32, reg, > + ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); > } > > // Visit all the instructions and emit the IR registers or the > value to @@ -2407,7 +2402,7 @@ namespace gbe > const ir::Type type = getType(ctx, elemType); > const ir::RegisterFamily pointerFamily = > ctx.getPointerFamily(); > > - if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == > ir::TYPE_S32) { > + if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == > + ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) { > // One message is enough here. Nothing special to do > if (elemNum <= 4) { > // Build the tuple data in the vector diff --git > a/src/cl_command_queue.c b/src/cl_command_queue.c index > 9606d6b..2454db6 100644 > --- a/src/cl_command_queue.c > +++ b/src/cl_command_queue.c > @@ -150,24 +150,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, > cl_kernel k) > return CL_SUCCESS; > } > > -LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, > - char * dst) > -{ > - int i; > - for(i = 0; i < k->arg_n; i++) { > - enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i); > - > - if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) { > - uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, > GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER); > - cl_mem mem = k->args[i].mem; > - cl_buffer_map(mem->bo, 1); > - void * addr = cl_buffer_get_virtual(mem->bo); > - memcpy(dst + offset, addr, mem->size); > - cl_buffer_unmap(mem->bo); > - } > - } > - return CL_SUCCESS; > -} > > #if USE_FULSIM > extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr); > diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index > 135d659..9fe1dd1 100644 > --- a/src/cl_command_queue.h > +++ b/src/cl_command_queue.h > @@ -76,8 +76,5 @@ extern cl_int > cl_command_queue_bind_surface(cl_command_queue, cl_kernel); > > /* Bind all the image surfaces in the GPGPU state */ extern cl_int > cl_command_queue_bind_image(cl_command_queue, cl_kernel); > - > -/*update constant buffer to final curbe */ -extern cl_int > cl_command_queue_upload_constant_buffer(cl_kernel k, char * dst); > #endif /* __CL_COMMAND_QUEUE_H__ */ > > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index 1d415d4..53a4711 100644 > --- a/src/cl_command_queue_gen7.c > +++ b/src/cl_command_queue_gen7.c > @@ -95,6 +95,62 @@ error: > return err; > } > > +static void > +cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker) { > + /* calculate constant buffer size */ > + int32_t arg; > + size_t offset; > + gbe_program prog = ker->program->opaque; > + const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque); > + size_t global_const_size = > +gbe_program_get_global_constant_size(prog); > + uint32_t constant_buf_size = 0; > + for (arg = 0; arg < arg_n; ++arg) { > + const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg); > + if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) { > + cl_mem mem = ker->args[arg].mem; > + constant_buf_size += ALIGN(mem->size, 4); > + } > + } > + if(global_const_size == 0 && constant_buf_size == 0) > + return; > + > + cl_buffer bo = > + cl_gpgpu_alloc_constant_address_space_buffer(queue->gpgpu, > + constant_buf_size + global_const_size + 4); cl_buffer_map(bo, 1); > + char * cst_addr = cl_buffer_get_virtual(bo); offset = 0; if > + (global_const_size > 0) { > + /* Write the global constant arrays */ > + gbe_program_get_global_constant_data(prog, > + (char*)(cst_addr+offset)); } offset += ALIGN(global_const_size, > + 4); > + > + if(global_const_size == 0) { > + /* reserve 4 bytes to get rid of 0 address */ > + offset += 4; > + } > + > + /* upload constant buffer argument */ int32_t curbe_offset = 0; > + for (arg = 0; arg < arg_n; ++arg) { > + const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg); > + if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) { > + cl_mem mem = ker->args[arg].mem; > + > + curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_KERNEL_ARGUMENT, arg); > + assert(curbe_offset >= 0); > + *(uint32_t *) (ker->curbe + curbe_offset) = offset; > + > + cl_buffer_map(mem->bo, 1); > + void * addr = cl_buffer_get_virtual(mem->bo); > + memcpy(cst_addr + offset, addr, mem->size); > + cl_buffer_unmap(mem->bo); > + offset += ALIGN(mem->size, 4); > + } > + } > + cl_buffer_unmap(bo); > +} > + > /* Will return the total amount of slm used */ static int32_t > cl_curbe_fill(cl_kernel ker, @@ -122,7 +178,6 @@ > cl_curbe_fill(cl_kernel ker, > UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]); > UPLOAD(GBE_CURBE_THREAD_NUM, thread_n); > UPLOAD(GBE_CURBE_WORK_DIM, work_dim); > - UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, > gbe_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32); #undef UPLOAD > > /* Write identity for the stack pointer. This is required by the > stack pointer @@ -134,14 +189,6 @@ cl_curbe_fill(cl_kernel ker, > int32_t i; > for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i; > } > - > - /* Write global constant arrays */ > - if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) { > - /* Write the global constant arrays */ > - gbe_program prog = ker->program->opaque; > - gbe_program_get_global_constant_data(prog, ker->curbe + offset); > - } > - > /* Handle the various offsets to SLM */ > const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque); > int32_t arg, slm_offset = 0; > @@ -242,6 +289,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > cl_setup_scratch(gpgpu, ker); > /* Bind a stack if needed */ > cl_bind_stack(gpgpu, ker); > + > + cl_upload_constant_buffer(queue, ker); > + > cl_gpgpu_states_setup(gpgpu, &kernel); > > /* Curbe step 2. Give the localID and upload it to video memory */ > @@ -250,7 +300,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz)); > for (i = 0; i < thread_n; ++i) { > memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz); > - cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * > i); > } > TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, > cst_sz, thread_n); > cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); > diff --git a/src/cl_driver.h b/src/cl_driver.h index 0ce03fe..0977bc3 > 100644 > --- a/src/cl_driver.h > +++ b/src/cl_driver.h > @@ -161,6 +161,9 @@ extern cl_gpgpu_set_perf_counters_cb > *cl_gpgpu_set_perf_counters; typedef void > (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, uint32_t > size); extern cl_gpgpu_upload_constants_cb > *cl_gpgpu_upload_constants; > > +typedef cl_buffer > +(cl_gpgpu_alloc_constant_address_space_buffer_cb)(cl_gpgpu, uint32_t > +size); extern cl_gpgpu_alloc_constant_address_space_buffer_cb > +*cl_gpgpu_alloc_constant_address_space_buffer; > + > /* Setup all indirect states */ > typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel > *kernel); extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup; > diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index > 7c4c866..d69e4af 100644 > --- a/src/cl_driver_defs.c > +++ b/src/cl_driver_defs.c > @@ -54,6 +54,7 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = > NULL; LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL; > LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL; LOCAL > cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; > +LOCAL cl_gpgpu_alloc_constant_address_space_buffer_cb * > +cl_gpgpu_alloc_constant_address_space_buffer = NULL; > LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = > NULL; LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = > NULL; LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL; > diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index > f58e1fd..db4daa3 100644 > --- a/src/cl_gt_device.h > +++ b/src/cl_gt_device.h > @@ -51,7 +51,7 @@ > .single_fp_config = 0, /* XXX */ > .global_mem_cache_type = CL_READ_WRITE_CACHE, .global_mem_size = 4, > -.max_constant_buffer_size = 64 << 10, > +.max_constant_buffer_size = 512 << 10, > .max_constant_args = 8, > .error_correction_support = CL_FALSE, .host_unified_memory = > CL_FALSE, diff --git a/src/cl_kernel.c b/src/cl_kernel.c index > 12a08c5..4ba1c11 100644 > --- a/src/cl_kernel.c > +++ b/src/cl_kernel.c > @@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, > size_t sz, const void *value) > > mem = *(cl_mem*) value; > > - if(arg_type == GBE_ARG_CONSTANT_PTR) { > - int32_t cbOffset; > - cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size); > - //constant ptr's curbe offset changed, update it > - if(cbOffset >= 0) { > - offset = gbe_kernel_get_curbe_offset(k->opaque, > GBE_CURBE_KERNEL_ARGUMENT, index); > - *((uint32_t *)(k->curbe + offset)) = cbOffset; //cb offset in curbe > - } > - } > - > cl_mem_add_ref(mem); > if (k->args[index].mem) > cl_mem_delete(k->args[index].mem); > diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index > 9959447..ef6e6c3 100644 > --- a/src/intel/intel_driver.c > +++ b/src/intel/intel_driver.c > @@ -380,7 +380,7 @@ cl_intel_driver_new(cl_context_prop props) > /* We use the first 2 slots(0,1) for all the bufs. should modify the above comments. Now we use 3 slots, 0,1 for normal buffers an 2 for constant buffer. > * Notify the gbe this base index, thus gbe can avoid conflicts > * when it allocates slots for images*/ > - gbe_set_image_base_index(2); > + gbe_set_image_base_index(3); > exit: > return driver; > error: > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index > 1301b66..eb15daa 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -96,6 +96,7 @@ struct intel_gpgpu > struct { drm_intel_bo *bo; } sampler_state_b; > struct { drm_intel_bo *bo; } perf_b; > struct { drm_intel_bo *bo; } scratch_b; > + struct { drm_intel_bo *bo; } constant_b; > > uint32_t per_thread_scratch; > struct { > @@ -138,6 +139,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu) > if (gpgpu->scratch_b.bo) > drm_intel_bo_unreference(gpgpu->scratch_b.bo); > > + if(gpgpu->constant_b.bo) > + drm_intel_bo_unreference(gpgpu->constant_b.bo); > + > intel_batchbuffer_delete(gpgpu->batch); > cl_free(gpgpu); > } > @@ -391,7 +395,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > /* Binded buffers */ > gpgpu->binded_n = 0; > gpgpu->img_bitmap = 0; > - gpgpu->img_index_base = 2; > + gpgpu->img_index_base = 3; > gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1); > > /* URB */ > @@ -404,7 +408,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > dri_bo_unreference(gpgpu->curbe_b.bo); > uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * > 64; > size_cb = ALIGN(size_cb, 4096); > - bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, > 64); > + bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64); > assert(bo); > gpgpu->curbe_b.bo = bo; > > @@ -468,6 +472,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, > int32_t index, dri_bo* obj_ > obj_bo); > } > > +static dri_bo* > +intel_gpgpu_alloc_constant_address_space_buffer(intel_gpgpu_t *gpgpu, > +uint32_t size) How about just use intel_gpgpu_alloc_constant_buffer? The above function name is a little bit long for me. --Zhigang > +{ > + uint32_t s = size - 1; > + assert(size != 0); > + > + surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual; > + gen7_surface_state_t *ss2 = (gen7_surface_state_t *) > + heap->surface[2]; memset(ss2, 0, sizeof(gen7_surface_state_t)); > + ss2->ss0.surface_type = I965_SURFACE_BUFFER; > + ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW; > + ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */ > + ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ > + ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ > + ss2->ss5.cache_control = cc_llc_l3; heap->binding_table[2] = > + offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t); > + > + if(gpgpu->constant_b.bo) > + dri_bo_unreference(gpgpu->constant_b.bo); > + gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, > +"CONSTANT_BUFFER", s, 64); > + assert(gpgpu->constant_b.bo); > + ss2->ss1.base_addr = gpgpu->constant_b.bo->offset; > + dri_bo_emit_reloc(gpgpu->surface_heap_b.bo, > + I915_GEM_DOMAIN_RENDER, > + I915_GEM_DOMAIN_RENDER, > + 0, > + heap->binding_table[2] + > + offsetof(gen7_surface_state_t, ss1), > + gpgpu->constant_b.bo); > + return gpgpu->constant_b.bo; > +} > + > + > /* Map address space with two 2GB surfaces. One surface for untyped message > and > * one surface for byte scatters / gathers. Actually the HW does not require > two > * surfaces but Fulsim complains > @@ -926,6 +963,7 @@ intel_set_gpgpu_callbacks(void) > cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init; > cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) > intel_gpgpu_set_perf_counters; > cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) > intel_gpgpu_upload_constants; > + cl_gpgpu_alloc_constant_address_space_buffer = > + (cl_gpgpu_alloc_constant_address_space_buffer_cb *) > + intel_gpgpu_alloc_constant_address_space_buffer; > cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) > intel_gpgpu_states_setup; > cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) > intel_gpgpu_upload_samplers; > cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) > intel_gpgpu_batch_reset; > -- > 1.7.9.5 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
