From: Yang Rong <[email protected]> Can't use 32-bit ops on pointer-related instructions. Prepare to enable SPIR64.
Signed-off-by: Ruiling Song <[email protected]> Signed-off-by: Yang Rong <[email protected]> --- backend/src/backend/gen_context.cpp | 17 +++++++++++++++ backend/src/ir/lowering.cpp | 10 +++++++-- backend/src/ir/profile.cpp | 8 +++---- backend/src/llvm/llvm_gen_backend.cpp | 37 +++++++++++++++++++++------------ backend/src/llvm/llvm_printf_parser.cpp | 29 +++++++++++++++++++------- 5 files changed, 74 insertions(+), 27 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 02d0bfd..cef4e4c 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -242,6 +242,23 @@ namespace gbe p->MUL(tmpReg, tmpReg, GenRegister::immuw(perThreadSize)); p->curr.execWidth = this->simdWidth; p->ADD(stackptr, stackptr, tmpReg); + if (fn.getPointerFamily() == ir::FAMILY_QWORD) { + const GenRegister selStatckPtr2 = this->simdWidth == 8 ? + GenRegister::ul8grf(ir::ocl::stackptr) : + GenRegister::ul16grf(ir::ocl::stackptr); + const GenRegister stackptr2 = ra->genReg(selStatckPtr2); + int simdWidth = p->curr.execWidth; + if (simdWidth == 16) { + // we need do second quarter first, because the dst type is QW, + // while the src is DW. If we do first quater first, the 1st + // quarter's dst would contain the 2nd quarter's src. 
+ p->curr.execWidth = 8; + p->curr.quarterControl = GEN_COMPRESSION_Q2; + p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(stackptr,1)); + } + p->curr.quarterControl = GEN_COMPRESSION_Q1; + p->MOV(stackptr2, stackptr); + } p->pop(); } diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp index 66ced8c..9ae90ef 100644 --- a/backend/src/ir/lowering.cpp +++ b/backend/src/ir/lowering.cpp @@ -367,8 +367,14 @@ namespace ir { const uint32_t offset = valueID * size; const Register reg = load->getValue(valueID); - - Instruction mov = ir::INDIRECT_MOV(type, reg, arg, load->getAddressRegister(), offset); + Register addressReg = load->getAddressRegister(); + if (fn->getPointerFamily() == FAMILY_QWORD) { + Register tmp = fn->newRegister(FAMILY_DWORD); + Instruction cvt = ir::CVT(ir::TYPE_U32, ir::TYPE_U64, tmp, load->getAddressRegister()); + cvt.insert(ins_after, &ins_after); + addressReg = tmp; + } + Instruction mov = ir::INDIRECT_MOV(type, reg, arg, addressReg, offset); mov.insert(ins_after, &ins_after); replaced = true; } diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 4486863..c3e1c9a 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -74,8 +74,8 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, goffset0, 1, GBE_CURBE_GLOBAL_OFFSET_X); DECL_NEW_REG(FAMILY_DWORD, goffset1, 1, GBE_CURBE_GLOBAL_OFFSET_Y); DECL_NEW_REG(FAMILY_DWORD, goffset2, 1, GBE_CURBE_GLOBAL_OFFSET_Z); - DECL_NEW_REG(FAMILY_DWORD, stackptr, 0); - DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); + DECL_NEW_REG(fn.getPointerFamily(), stackptr, 0); + DECL_NEW_REG(fn.getPointerFamily(), stackbuffer, 1, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); DECL_NEW_REG(FAMILY_WORD, blockip, 0, GBE_CURBE_BLOCK_IP); DECL_NEW_REG(FAMILY_DWORD, barrierid, 1); DECL_NEW_REG(FAMILY_DWORD, threadn, 1, GBE_CURBE_THREAD_NUM); @@ -83,8 +83,8 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, zero, 1); DECL_NEW_REG(FAMILY_DWORD, 
one, 1); DECL_NEW_REG(FAMILY_WORD, retVal, 1); - DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER); - DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER); + DECL_NEW_REG(fn.getPointerFamily(), printfbptr, 1, GBE_CURBE_PRINTF_BUF_POINTER); + DECL_NEW_REG(fn.getPointerFamily(), printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER); DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP); } #undef DECL_NEW_REG diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index f89ae0d..9964802 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -1218,12 +1218,12 @@ namespace gbe } Builder.SetInsertPoint(cast<Instruction>(theUser)); - Type *int32Ty = Type::getInt32Ty(ptr->getContext()); - Value *v1 = Builder.CreatePtrToInt(pointerOp, int32Ty); + Type *ptyTy = IntegerType::get(ptr->getContext(), ptr->getType()->getIntegerBitWidth()); + Value *v1 = Builder.CreatePtrToInt(pointerOp, ptyTy); - Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), int32Ty); - Value *v3 = Builder.CreatePtrToInt(base, int32Ty); - Value *v4 = Builder.CreatePtrToInt(bti, int32Ty); + Value *v2 = Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), ptyTy); + Value *v3 = Builder.CreatePtrToInt(base, ptyTy); + Value *v4 = Builder.CreatePtrToInt(bti, ptyTy); // newLocBase = (pointer - origin) + base_start Value *diff = Builder.CreateSub(v1, v2); Value *newLocBase = Builder.CreateAdd(v3, diff); @@ -1600,7 +1600,10 @@ namespace gbe // NULL pointers if(isa<ConstantPointerNull>(CPV)) { - return ctx.newImmediate(uint32_t(0)); + if (ctx.getPointerFamily() == ir::FAMILY_QWORD) + return ctx.newImmediate(uint64_t(0)); + else + return ctx.newImmediate(uint32_t(0)); } const Type::TypeID typeID = CPV->getType()->getTypeID(); @@ -2553,13 +2556,13 @@ namespace gbe this->newRegister(const_cast<GlobalVariable*>(&v)); ir::Register reg = 
regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0); - ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32)); + ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(oldSlm + padding/8, getType(ctx, v.getType()))); } else if(addrSpace == ir::MEM_CONSTANT || v.isConstant()) { GBE_ASSERT(v.hasInitializer()); this->newRegister(const_cast<GlobalVariable*>(&v)); ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0); ir::Constant &con = unit.getConstantSet().getConstant(v.getName()); - ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32)); + ctx.LOADI(getType(ctx, v.getType()), reg, ctx.newIntegerImmediate(con.getOffset(), getType(ctx, v.getType()))); } else { if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) { ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second); @@ -4342,15 +4345,23 @@ namespace gbe uint32_t prevStackPtr = ctx.getFunction().getStackSize(); uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr; if (step != 0) { - ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32); + ir::ImmediateIndex stepImm; + ir::Type pointerTy = getType(pointerFamily); + if (ctx.getPointerSize() == ir::POINTER_32_BITS) + stepImm = ctx.newImmediate(uint32_t(step)); + else + stepImm = ctx.newImmediate(uint64_t(step)); ir::Register stepReg = ctx.reg(ctx.getPointerFamily()); - ctx.LOADI(ir::TYPE_U32, stepReg, stepImm); - ctx.ADD(ir::TYPE_U32, stack, stack, stepReg); + ctx.LOADI(pointerTy, stepReg, stepImm); + ctx.ADD(pointerTy, stack, stack, stepReg); ctx.getFunction().pushStackSize(step); } } // Set the destination register properly - ctx.MOV(imm.getType(), dst, stack); + if (legacyMode) + ctx.MOV(imm.getType(), dst, stack); + else + ctx.ADD(imm.getType(), dst, stack, ir::ocl::stackbuffer); ctx.LOADI(imm.getType(), reg, immIndex); ctx.ADD(imm.getType(), stack, stack, reg); @@ 
-4518,7 +4529,7 @@ namespace gbe // but later ArgumentLower pass need to match exact load/addImm pattern // so, I avoid subtracting zero base to satisfy ArgumentLower pass. if (!zeroBase) - ctx.SUB(ir::TYPE_U32, mPtr, pointer, baseReg); + ctx.SUB(getType(ctx, llvmPtr->getType()), mPtr, pointer, baseReg); else mPtr = pointer; } else { diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp index d1cb1ae..edaf5cc 100644 --- a/backend/src/llvm/llvm_printf_parser.cpp +++ b/backend/src/llvm/llvm_printf_parser.cpp @@ -350,29 +350,36 @@ error: { Value* op0 = NULL; Value* val = NULL; + const DataLayout &DL = module->getDataLayout(); + Type *ptrIntTy = IntegerType::get(module->getContext(), DL.getPointerSizeInBits()); ///////////////////////////////////////////////////// /* calculate index address. index_addr = (index_offset + wg_offset )* sizeof(int) * 2 + index_buf_ptr index_offset = global_size2 * global_size1 * global_size0 * printf_num */ - Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, printf_num)); + Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(ptrIntTy, printf_num)); // index_offset + offset op0 = builder->CreateAdd(index_offset, wg_offset); // (index_offset + offset)* sizeof(int) * 2 - op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)*2)); + op0 = builder->CreateMul(op0, ConstantInt::get(ptrIntTy, sizeof(int)*2)); // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int) op0 = builder->CreateAdd(index_buf_ptr, op0); Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1)); // Load the printf num first, printf may be in loop. Value* loop_num = builder->CreateLoad(index_addr); + //if(DL.getPointerSizeInBits() == 64) + // loop_num = builder->CreateZExt(loop_num, ptrIntTy); val = builder->CreateAdd(loop_num, ConstantInt::get(intTy, 1)); builder->CreateStore(val, index_addr);// The loop number. 
- op0 = builder->CreateAdd(op0, ConstantInt::get(intTy, sizeof(int))); + op0 = builder->CreateAdd(op0, ConstantInt::get(ptrIntTy, sizeof(int))); index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1)); builder->CreateStore(ConstantInt::get(intTy, printf_num), index_addr);// The printf number. + if(DL.getPointerSizeInBits() == 64) + loop_num = builder->CreateZExt(loop_num, ptrIntTy); + int i = 1; Value* data_addr = NULL; for (auto &s : (*pInfo.printf_fmt).first) { @@ -406,14 +413,14 @@ error: data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */ - op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset)); + op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(ptrIntTy, out_buf_sizeof_offset)); //offset * sizeof(specify) - val = builder->CreateMul(wg_offset, ConstantInt::get(intTy, sizeof_size)); + val = builder->CreateMul(wg_offset, ConstantInt::get(ptrIntTy, sizeof_size)); //data_offset + pbuf_ptr op0 = builder->CreateAdd(pbuf_ptr, op0); op0 = builder->CreateAdd(op0, val); //totalSizeofSize * global_size2 * global_size1 * global_size0 - val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, totalSizeofSize)); + val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(ptrIntTy, totalSizeofSize)); //totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num val = builder->CreateMul(val, loop_num); //final @@ -543,6 +550,8 @@ error: totalSizeofSize = 0; module = F.getParent(); intTy = IntegerType::get(module->getContext(), 32); + const DataLayout &DL = module->getDataLayout(); + Type *ptrIntTy = IntegerType::get(module->getContext(), DL.getPointerSizeInBits()); // As we inline all function calls, so skip non-kernel functions bool bKernel = isKernelFunction(F); @@ -608,7 +617,7 @@ error: nullptr, GlobalVariable::NotThreadLocal, 1); - pbuf_ptr = builder->CreatePtrToInt(pBuf, 
Type::getInt32Ty(module->getContext())); + pbuf_ptr = builder->CreatePtrToInt(pBuf, ptrIntTy); } if (!index_buf_ptr) { Type *ptrTy = Type::getInt32PtrTy(module->getContext(), 1); @@ -619,7 +628,7 @@ error: nullptr, GlobalVariable::NotThreadLocal, 1); - index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext())); + index_buf_ptr = builder->CreatePtrToInt(pBuf, ptrIntTy); } if (!wg_offset || !g1Xg2Xg3) { @@ -683,6 +692,10 @@ error: op0 = builder->CreateMul(global_size2, global_size1); // global_size2 * global_size1 * global_size0 g1Xg2Xg3 = builder->CreateMul(op0, global_size0); + if(DL.getPointerSizeInBits() == 64) { + wg_offset = builder->CreateZExt(wg_offset, ptrIntTy); + g1Xg2Xg3 = builder->CreateZExt(g1Xg2Xg3, ptrIntTy); + } } -- 1.9.1 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
