LGTM, pushed, thanks.
On Wed, Sep 04, 2013 at 02:24:54PM +0800, Ruiling Song wrote: > Currently, we simply allocate enough graphics memory as the constant memory space > and bind it to bti 2. Constant cache reads are backed by dword scatter reads. > Different from other data port messages, the addresses need to be dword aligned, > and the addresses are in units of dwords. > > The constant address space data are placed in order: first the global constants, > then the constant buffer kernel arguments. > > v2: change function & variable naming to clearly distinguish 'curbe' from 'constant > buffer' > > Signed-off-by: Ruiling Song <[email protected]> > --- > backend/src/backend/context.cpp | 12 +---- > backend/src/backend/gen_insn_selection.cpp | 32 +++++++++++- > backend/src/backend/gen_reg_allocation.cpp | 1 - > backend/src/backend/program.h | 2 - > backend/src/ir/profile.cpp | 2 - > backend/src/ir/profile.hpp | 5 +- > backend/src/llvm/llvm_gen_backend.cpp | 9 +--- > src/cl_command_queue.c | 18 ------- > src/cl_command_queue.h | 3 -- > src/cl_command_queue_gen7.c | 77 > +++++++++++++++++++++++----- > src/cl_driver.h | 11 ++-- > src/cl_driver_defs.c | 3 +- > src/cl_gt_device.h | 2 +- > src/cl_kernel.c | 10 ---- > src/intel/intel_driver.c | 2 +- > src/intel/intel_gpgpu.c | 60 ++++++++++++++++++---- > 16 files changed, 158 insertions(+), 91 deletions(-) > > diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp > index 5484869..ac3a243 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -458,15 +458,6 @@ namespace gbe > } > }); > #undef INSERT_REG > - this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, 0, sizeof(int)); > - specialRegs.insert(ir::ocl::constoffst); > - > - // Insert serialized global constant arrays if used > - const ir::ConstantSet& constantSet = unit.getConstantSet(); > - if (constantSet.getConstantNum()) { > - size_t size = constantSet.getDataSize(); > - this->newCurbeEntry(GBE_CURBE_GLOBAL_CONSTANT_DATA, 0, size); > - } > > // Insert 
the number of threads > this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)); > @@ -640,8 +631,7 @@ namespace gbe > reg == ir::ocl::goffset0 || > reg == ir::ocl::goffset1 || > reg == ir::ocl::goffset2 || > - reg == ir::ocl::workdim || > - reg == ir::ocl::constoffst) > + reg == ir::ocl::workdim) > return true; > return false; > } > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 1e81dac..b2c2798 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -2101,6 +2101,23 @@ namespace gbe > sel.UNTYPED_READ(addr, dst.data(), valueNum, bti); > } > > + void emitDWordGather(Selection::Opaque &sel, > + const ir::LoadInstruction &insn, > + GenRegister addr, > + uint32_t bti) const > + { > + using namespace ir; > + const uint32_t valueNum = insn.getValueNum(); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GBE_ASSERT(valueNum == 1); > + GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), > GEN_TYPE_F); > + // get dword based address > + GenRegister addrDW = GenRegister::udxgrf(simdWidth, > sel.reg(FAMILY_DWORD)); > + sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), > GenRegister::immud(2)); > + > + sel.DWORD_GATHER(dst, addrDW, bti); > + } > + > void emitRead64(Selection::Opaque &sel, > const ir::LoadInstruction &insn, > GenRegister addr, > @@ -2171,8 +2188,19 @@ namespace gbe > GBE_ASSERT(sel.ctx.isScalarReg(insn.getValue(0)) == false); > const Type type = insn.getValueType(); > const uint32_t elemSize = getByteScatterGatherSize(type); > - if (insn.getAddressSpace() == MEM_CONSTANT) > - this->emitIndirectMove(sel, insn, address); > + if (insn.getAddressSpace() == MEM_CONSTANT) { > + // XXX TODO read 64bit constant through constant cache > + // Per HW Spec, constant cache messages can read at least DWORD data. > + // So, byte/short data type, we have to read through data cache. 
> + if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) > + this->emitRead64(sel, insn, address, 0x2); > + else if(insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_DWORD) > + this->emitDWordGather(sel, insn, address, 0x2); > + else { > + const GenRegister value = sel.selReg(insn.getValue(0)); > + this->emitByteGather(sel, insn, elemSize, address, value, 0x2); > + } > + } > else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_QWORD) > this->emitRead64(sel, insn, address, space == MEM_LOCAL ? 0xfe : > 0x00); > else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_DWORD) > diff --git a/backend/src/backend/gen_reg_allocation.cpp > b/backend/src/backend/gen_reg_allocation.cpp > index 0bb75a2..2abfb12 100644 > --- a/backend/src/backend/gen_reg_allocation.cpp > +++ b/backend/src/backend/gen_reg_allocation.cpp > @@ -573,7 +573,6 @@ namespace gbe > allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2); > allocatePayloadReg(GBE_CURBE_STACK_POINTER, ocl::stackptr); > allocatePayloadReg(GBE_CURBE_THREAD_NUM, ocl::threadn); > - allocatePayloadReg(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, ocl::constoffst); > > // Group and barrier IDs are always allocated by the hardware in r0 > RA.insert(std::make_pair(ocl::groupid0, 1*sizeof(float))); // r0.1 > diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h > index d20e7af..ff4d157 100644 > --- a/backend/src/backend/program.h > +++ b/backend/src/backend/program.h > @@ -70,8 +70,6 @@ enum gbe_curbe_type { > GBE_CURBE_GROUP_NUM_Y, > GBE_CURBE_GROUP_NUM_Z, > GBE_CURBE_WORK_DIM, > - GBE_CURBE_GLOBAL_CONSTANT_OFFSET, > - GBE_CURBE_GLOBAL_CONSTANT_DATA, > GBE_CURBE_IMAGE_INFO, > GBE_CURBE_STACK_POINTER, > GBE_CURBE_KERNEL_ARGUMENT, > diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp > index 675018a..927e43d 100644 > --- a/backend/src/ir/profile.cpp > +++ b/backend/src/ir/profile.cpp > @@ -40,7 +40,6 @@ namespace ir { > "stack_pointer", > "block_ip", > 
"barrier_id", "thread_number", > - "const_curbe_offset", > "work_dimension", > }; > > @@ -76,7 +75,6 @@ namespace ir { > DECL_NEW_REG(FAMILY_WORD, blockip); > DECL_NEW_REG(FAMILY_DWORD, barrierid); > DECL_NEW_REG(FAMILY_DWORD, threadn); > - DECL_NEW_REG(FAMILY_DWORD, constoffst); > DECL_NEW_REG(FAMILY_DWORD, workdim); > } > #undef DECL_NEW_REG > diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp > index 4b0ef5e..c79bc3b 100644 > --- a/backend/src/ir/profile.hpp > +++ b/backend/src/ir/profile.hpp > @@ -63,9 +63,8 @@ namespace ir { > static const Register blockip = Register(19); // blockip > static const Register barrierid = Register(20);// barrierid > static const Register threadn = Register(21); // number of threads > - static const Register constoffst = Register(22); // offset of global > constant array's curbe > - static const Register workdim = Register(23); // work dimention. > - static const uint32_t regNum = 24; // number of special > registers > + static const Register workdim = Register(22); // work dimention. > + static const uint32_t regNum = 23; // number of special > registers > extern const char *specialRegMean[]; // special register name. 
> } /* namespace ocl */ > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 12d809d..e747d00 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -1243,12 +1243,7 @@ namespace gbe > ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD); > ir::Constant &con = unit.getConstantSet().getConstant(j ++); > con.setReg(reg.value()); > - if(con.getOffset() != 0) { > - ctx.LOADI(ir::TYPE_S32, reg, > ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); > - ctx.ADD(ir::TYPE_S32, reg, ir::ocl::constoffst, reg); > - } else { > - ctx.MOV(ir::TYPE_S32, reg, ir::ocl::constoffst); > - } > + ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), > ir::TYPE_S32)); > } > > // Visit all the instructions and emit the IR registers or the value to > @@ -2407,7 +2402,7 @@ namespace gbe > const ir::Type type = getType(ctx, elemType); > const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); > > - if (type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == > ir::TYPE_S32) { > + if ((type == ir::TYPE_FLOAT || type == ir::TYPE_U32 || type == > ir::TYPE_S32) && addrSpace != ir::MEM_CONSTANT) { > // One message is enough here. 
Nothing special to do > if (elemNum <= 4) { > // Build the tuple data in the vector > diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c > index 9606d6b..2454db6 100644 > --- a/src/cl_command_queue.c > +++ b/src/cl_command_queue.c > @@ -150,24 +150,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, > cl_kernel k) > return CL_SUCCESS; > } > > -LOCAL cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, > - char * dst) > -{ > - int i; > - for(i = 0; i < k->arg_n; i++) { > - enum gbe_arg_type arg_type = gbe_kernel_get_arg_type(k->opaque, i); > - > - if(arg_type == GBE_ARG_CONSTANT_PTR && k->args[i].mem) { > - uint32_t offset = gbe_kernel_get_curbe_offset(k->opaque, > GBE_CURBE_EXTRA_ARGUMENT, i+GBE_CONSTANT_BUFFER); > - cl_mem mem = k->args[i].mem; > - cl_buffer_map(mem->bo, 1); > - void * addr = cl_buffer_get_virtual(mem->bo); > - memcpy(dst + offset, addr, mem->size); > - cl_buffer_unmap(mem->bo); > - } > - } > - return CL_SUCCESS; > -} > > #if USE_FULSIM > extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr); > diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h > index 135d659..9fe1dd1 100644 > --- a/src/cl_command_queue.h > +++ b/src/cl_command_queue.h > @@ -76,8 +76,5 @@ extern cl_int > cl_command_queue_bind_surface(cl_command_queue, cl_kernel); > > /* Bind all the image surfaces in the GPGPU state */ > extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel); > - > -/*update constant buffer to final curbe */ > -extern cl_int cl_command_queue_upload_constant_buffer(cl_kernel k, char * > dst); > #endif /* __CL_COMMAND_QUEUE_H__ */ > > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index 1d415d4..68630cf 100644 > --- a/src/cl_command_queue_gen7.c > +++ b/src/cl_command_queue_gen7.c > @@ -76,7 +76,7 @@ cl_set_varying_payload(const cl_kernel ker, > block_ips[curr] = 0; > } > > - /* Copy them to the constant buffer */ > + /* Copy them to the curbe buffer */ > curr = 0; > for (i = 0; i < 
thread_n; ++i, data += cst_sz) { > uint32_t *ids0 = (uint32_t *) (data + id_offset[0]); > @@ -95,6 +95,62 @@ error: > return err; > } > > +static void > +cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker) > +{ > + /* calculate constant buffer size */ > + int32_t arg; > + size_t offset; > + gbe_program prog = ker->program->opaque; > + const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque); > + size_t global_const_size = gbe_program_get_global_constant_size(prog); > + uint32_t constant_buf_size = 0; > + for (arg = 0; arg < arg_n; ++arg) { > + const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg); > + if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) { > + cl_mem mem = ker->args[arg].mem; > + constant_buf_size += ALIGN(mem->size, 4); > + } > + } > + if(global_const_size == 0 && constant_buf_size == 0) > + return; > + > + cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, > constant_buf_size + global_const_size + 4); > + cl_buffer_map(bo, 1); > + char * cst_addr = cl_buffer_get_virtual(bo); > + offset = 0; > + if (global_const_size > 0) { > + /* Write the global constant arrays */ > + gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset)); > + } > + offset += ALIGN(global_const_size, 4); > + > + if(global_const_size == 0) { > + /* reserve 4 bytes to get rid of 0 address */ > + offset += 4; > + } > + > + /* upload constant buffer argument */ > + int32_t curbe_offset = 0; > + for (arg = 0; arg < arg_n; ++arg) { > + const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg); > + if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) { > + cl_mem mem = ker->args[arg].mem; > + > + curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_KERNEL_ARGUMENT, arg); > + assert(curbe_offset >= 0); > + *(uint32_t *) (ker->curbe + curbe_offset) = offset; > + > + cl_buffer_map(mem->bo, 1); > + void * addr = cl_buffer_get_virtual(mem->bo); > + memcpy(cst_addr + offset, addr, mem->size); > + 
cl_buffer_unmap(mem->bo); > + offset += ALIGN(mem->size, 4); > + } > + } > + cl_buffer_unmap(bo); > +} > + > /* Will return the total amount of slm used */ > static int32_t > cl_curbe_fill(cl_kernel ker, > @@ -122,7 +178,6 @@ cl_curbe_fill(cl_kernel ker, > UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]); > UPLOAD(GBE_CURBE_THREAD_NUM, thread_n); > UPLOAD(GBE_CURBE_WORK_DIM, work_dim); > - UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, > gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + > 32); > #undef UPLOAD > > /* Write identity for the stack pointer. This is required by the stack > pointer > @@ -134,14 +189,6 @@ cl_curbe_fill(cl_kernel ker, > int32_t i; > for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i; > } > - > - /* Write global constant arrays */ > - if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, > GBE_CURBE_GLOBAL_CONSTANT_DATA, 0)) >= 0) { > - /* Write the global constant arrays */ > - gbe_program prog = ker->program->opaque; > - gbe_program_get_global_constant_data(prog, ker->curbe + offset); > - } > - > /* Handle the various offsets to SLM */ > const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque); > int32_t arg, slm_offset = 0; > @@ -220,9 +267,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > /* Compute the number of HW threads we need */ > TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz); > kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz; > - kernel.cst_sz = cst_sz; > + kernel.curbe_sz = cst_sz; > > - /* Curbe step 1: fill the constant buffer data shared by all threads */ > + /* Curbe step 1: fill the constant urb buffer data shared by all threads */ > if (ker->curbe) { > kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, > global_wk_sz, local_wk_sz, thread_n); > if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) > @@ -242,6 +289,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > cl_setup_scratch(gpgpu, ker); > /* Bind a stack 
if needed */ > cl_bind_stack(gpgpu, ker); > + > + cl_upload_constant_buffer(queue, ker); > + > cl_gpgpu_states_setup(gpgpu, &kernel); > > /* Curbe step 2. Give the localID and upload it to video memory */ > @@ -250,10 +300,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz)); > for (i = 0; i < thread_n; ++i) { > memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz); > - cl_command_queue_upload_constant_buffer(ker, final_curbe + cst_sz * > i); > } > TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, > cst_sz, thread_n); > - cl_gpgpu_upload_constants(gpgpu, final_curbe, thread_n*cst_sz); > + cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz); > } > > /* Start a new batch buffer */ > diff --git a/src/cl_driver.h b/src/cl_driver.h > index 0ce03fe..95d6485 100644 > --- a/src/cl_driver.h > +++ b/src/cl_driver.h > @@ -100,7 +100,7 @@ typedef enum gpu_command_status { > typedef struct cl_gpgpu_kernel { > const char *name; /* kernel name and bo name */ > uint32_t grf_blocks; /* register blocks kernel wants (in 8 reg blocks) > */ > - uint32_t cst_sz; /* total size of all constants */ > + uint32_t curbe_sz; /* total size of all curbes */ > cl_buffer bo; /* kernel code in the proper addr space */ > int32_t barrierID; /* barrierID for _this_ kernel */ > uint32_t use_slm:1; /* For gen7 (automatic barrier management) */ > @@ -157,9 +157,12 @@ extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init; > typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf); > extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters; > > -/* Fills current constant buffer with data */ > -typedef void (cl_gpgpu_upload_constants_cb)(cl_gpgpu, const void* data, > uint32_t size); > -extern cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants; > +/* Fills current curbe buffer with data */ > +typedef void (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, > uint32_t size); > +extern 
cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes; > + > +typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t > size); > +extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer; > > /* Setup all indirect states */ > typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel); > diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c > index 7c4c866..ae130fa 100644 > --- a/src/cl_driver_defs.c > +++ b/src/cl_driver_defs.c > @@ -54,8 +54,9 @@ LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL; > LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL; > LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL; > LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; > +LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = > NULL; > LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL; > -LOCAL cl_gpgpu_upload_constants_cb *cl_gpgpu_upload_constants = NULL; > +LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL; > LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL; > LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL; > LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL; > diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h > index f58e1fd..db4daa3 100644 > --- a/src/cl_gt_device.h > +++ b/src/cl_gt_device.h > @@ -51,7 +51,7 @@ > .single_fp_config = 0, /* XXX */ > .global_mem_cache_type = CL_READ_WRITE_CACHE, > .global_mem_size = 4, > -.max_constant_buffer_size = 64 << 10, > +.max_constant_buffer_size = 512 << 10, > .max_constant_args = 8, > .error_correction_support = CL_FALSE, > .host_unified_memory = CL_FALSE, > diff --git a/src/cl_kernel.c b/src/cl_kernel.c > index 12a08c5..4ba1c11 100644 > --- a/src/cl_kernel.c > +++ b/src/cl_kernel.c > @@ -186,16 +186,6 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, > const void *value) > > mem = *(cl_mem*) value; > > - if(arg_type == GBE_ARG_CONSTANT_PTR) { > - 
int32_t cbOffset; > - cbOffset = gbe_kernel_set_const_buffer_size(k->opaque, index, mem->size); > - //constant ptr's curbe offset changed, update it > - if(cbOffset >= 0) { > - offset = gbe_kernel_get_curbe_offset(k->opaque, > GBE_CURBE_KERNEL_ARGUMENT, index); > - *((uint32_t *)(k->curbe + offset)) = cbOffset; //cb offset in curbe > - } > - } > - > cl_mem_add_ref(mem); > if (k->args[index].mem) > cl_mem_delete(k->args[index].mem); > diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c > index 9959447..ef6e6c3 100644 > --- a/src/intel/intel_driver.c > +++ b/src/intel/intel_driver.c > @@ -380,7 +380,7 @@ cl_intel_driver_new(cl_context_prop props) > /* We use the first 2 slots(0,1) for all the bufs. > * Notify the gbe this base index, thus gbe can avoid conflicts > * when it allocates slots for images*/ > - gbe_set_image_base_index(2); > + gbe_set_image_base_index(3); > exit: > return driver; > error: > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c > index 1301b66..6dac89d 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -79,7 +79,7 @@ struct intel_gpgpu > intel_batchbuffer_t *batch; > cl_gpgpu_kernel *ker; > drm_intel_bo *binded_buf[max_buf_n]; /* all buffers binded for the call */ > - uint32_t binded_offset[max_buf_n]; /* their offsets in the constant > buffer */ > + uint32_t binded_offset[max_buf_n]; /* their offsets in the curbe buffer > */ > uint32_t binded_n; /* number of buffers binded */ > > unsigned long img_bitmap; /* image usage bitmap. 
*/ > @@ -96,6 +96,7 @@ struct intel_gpgpu > struct { drm_intel_bo *bo; } sampler_state_b; > struct { drm_intel_bo *bo; } perf_b; > struct { drm_intel_bo *bo; } scratch_b; > + struct { drm_intel_bo *bo; } constant_b; > > uint32_t per_thread_scratch; > struct { > @@ -138,6 +139,9 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu) > if (gpgpu->scratch_b.bo) > drm_intel_bo_unreference(gpgpu->scratch_b.bo); > > + if(gpgpu->constant_b.bo) > + drm_intel_bo_unreference(gpgpu->constant_b.bo); > + > intel_batchbuffer_delete(gpgpu->batch); > cl_free(gpgpu); > } > @@ -231,7 +235,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu) > } > > static void > -intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu) > +intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu) > { > BEGIN_BATCH(gpgpu->batch, 4); > OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ > @@ -319,7 +323,7 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu) > intel_gpgpu_select_pipeline(gpgpu); > intel_gpgpu_set_base_address(gpgpu); > intel_gpgpu_load_vfe_state(gpgpu); > - intel_gpgpu_load_constant_buffer(gpgpu); > + intel_gpgpu_load_curbe_buffer(gpgpu); > intel_gpgpu_load_idrt(gpgpu); > > if (gpgpu->perf_b.bo) { > @@ -391,7 +395,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > /* Binded buffers */ > gpgpu->binded_n = 0; > gpgpu->img_bitmap = 0; > - gpgpu->img_index_base = 2; > + gpgpu->img_index_base = 3; > gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1); > > /* URB */ > @@ -399,12 +403,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > gpgpu->urb.size_cs_entry = size_cs_entry; > gpgpu->max_threads = max_threads; > > - /* Constant buffer */ > + /* Constant URB buffer */ > if(gpgpu->curbe_b.bo) > dri_bo_unreference(gpgpu->curbe_b.bo); > uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * > 64; > size_cb = ALIGN(size_cb, 4096); > - bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64); > + bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CURBE_BUFFER", size_cb, 64); > 
assert(bo); > gpgpu->curbe_b.bo = bo; > > @@ -468,6 +472,39 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, > int32_t index, dri_bo* obj_ > obj_bo); > } > > +static dri_bo* > +intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size) > +{ > + uint32_t s = size - 1; > + assert(size != 0); > + > + surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual; > + gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[2]; > + memset(ss2, 0, sizeof(gen7_surface_state_t)); > + ss2->ss0.surface_type = I965_SURFACE_BUFFER; > + ss2->ss0.surface_format = I965_SURFACEFORMAT_RAW; > + ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */ > + ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ > + ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ > + ss2->ss5.cache_control = cc_llc_l3; > + heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* > sizeof(gen7_surface_state_t); > + > + if(gpgpu->constant_b.bo) > + dri_bo_unreference(gpgpu->constant_b.bo); > + gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, > "CONSTANT_BUFFER", s, 64); > + assert(gpgpu->constant_b.bo); > + ss2->ss1.base_addr = gpgpu->constant_b.bo->offset; > + dri_bo_emit_reloc(gpgpu->surface_heap_b.bo, > + I915_GEM_DOMAIN_RENDER, > + I915_GEM_DOMAIN_RENDER, > + 0, > + heap->binding_table[2] + > + offsetof(gen7_surface_state_t, ss1), > + gpgpu->constant_b.bo); > + return gpgpu->constant_b.bo; > +} > + > + > /* Map address space with two 2GB surfaces. One surface for untyped message > and > * one surface for byte scatters / gathers. 
Actually the HW does not require > two > * surfaces but Fulsim complains > @@ -611,7 +648,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, > cl_gpgpu_kernel *kernel) > desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5; > desc->desc3.binding_table_entry_count = 0; /* no prefetch */ > desc->desc3.binding_table_pointer = 0; > - desc->desc4.curbe_read_len = kernel->cst_sz / 32; > + desc->desc4.curbe_read_len = kernel->curbe_sz / 32; > desc->desc4.curbe_read_offset = 0; > > /* Barriers / SLM are automatically handled on Gen7+ */ > @@ -650,7 +687,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, > cl_gpgpu_kernel *kernel) > } > > static void > -intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, > uint32_t size) > +intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t > size) > { > unsigned char *curbe = NULL; > cl_gpgpu_kernel *k = gpgpu->ker; > @@ -665,9 +702,9 @@ intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const > void* data, uint32_t si > /* Now put all the relocations for our flat address space */ > for (i = 0; i < k->thread_n; ++i) > for (j = 0; j < gpgpu->binded_n; ++j) { > - *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->cst_sz) = > gpgpu->binded_buf[j]->offset; > + *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = > gpgpu->binded_buf[j]->offset; > drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo, > - gpgpu->binded_offset[j]+i*k->cst_sz, > + gpgpu->binded_offset[j]+i*k->curbe_sz, > gpgpu->binded_buf[j], > 0, > I915_GEM_DOMAIN_RENDER, > @@ -925,7 +962,8 @@ intel_set_gpgpu_callbacks(void) > cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack; > cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init; > cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) > intel_gpgpu_set_perf_counters; > - cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) > intel_gpgpu_upload_constants; > + cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) > 
intel_gpgpu_upload_curbes; > + cl_gpgpu_alloc_constant_buffer = (cl_gpgpu_alloc_constant_buffer_cb *) > intel_gpgpu_alloc_constant_buffer; > cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) > intel_gpgpu_states_setup; > cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) > intel_gpgpu_upload_samplers; > cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset; > -- > 1.7.9.5 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
