As we discussed offline, please use llvm.ctlz directly in the clz() builtin function. That way we don't need to implement __gen_ocl_lzd(), which is a non-standard intrinsic, and we can avoid duplicating a lot of code.
On Mon, Jan 26, 2015 at 02:57:46PM +0800, [email protected] wrote: > From: Luo Xionghu <[email protected]> > > the fbh style is inefficient. > > Signed-off-by: Luo Xionghu <[email protected]> > --- > backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 > +++++++---------------------- > backend/src/llvm/llvm_gen_backend.cpp | 76 +++++++++++++++++++++++++ > backend/src/llvm/llvm_gen_ocl_function.hxx | 1 + > 3 files changed, 98 insertions(+), 65 deletions(-) > > diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl > b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl > index 6da0bab..36da959 100644 > --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl > @@ -19,6 +19,16 @@ > > PURE CONST uint __gen_ocl_fbh(uint); > PURE CONST uint __gen_ocl_fbl(uint); > + > +PURE CONST OVERLOADABLE ulong __gen_ocl_lzd(ulong); > +PURE CONST OVERLOADABLE long __gen_ocl_lzd(long); > +PURE CONST OVERLOADABLE uint __gen_ocl_lzd(uint); > +PURE CONST OVERLOADABLE int __gen_ocl_lzd(int); > +PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort); > +PURE CONST OVERLOADABLE short __gen_ocl_lzd(short); > +PURE CONST OVERLOADABLE uchar __gen_ocl_lzd(uchar); > +PURE CONST OVERLOADABLE char __gen_ocl_lzd(char); > + > PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint); > PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int); > PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort); > @@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short); > PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar); > PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char); > > -OVERLOADABLE char clz(char x) { > - if (x < 0) > - return 0; > - if (x == 0) > - return 8; > - return __gen_ocl_fbh(x) - 24; > -} > - > -OVERLOADABLE uchar clz(uchar x) { > - if (x == 0) > - return 8; > - return __gen_ocl_fbh(x) - 24; > -} > - > -OVERLOADABLE short clz(short x) { > - if (x < 0) > - return 0; > - if (x == 0) > - return 16; > - return __gen_ocl_fbh(x) - 16; > -} > - > -OVERLOADABLE ushort 
clz(ushort x) { > - if (x == 0) > - return 16; > - return __gen_ocl_fbh(x) - 16; > -} > - > -OVERLOADABLE int clz(int x) { > - if (x < 0) > - return 0; > - if (x == 0) > - return 32; > - return __gen_ocl_fbh(x); > -} > - > -OVERLOADABLE uint clz(uint x) { > - if (x == 0) > - return 32; > - return __gen_ocl_fbh(x); > -} > - > -OVERLOADABLE long clz(long x) { > - union { int i[2]; long x; } u; > - u.x = x; > - if (u.i[1] & 0x80000000u) > - return 0; > - if (u.i[1] == 0 && u.i[0] == 0) > - return 64; > - uint v = clz(u.i[1]); > - if(v == 32) > - v += clz(u.i[0]); > - return v; > -} > - > -OVERLOADABLE ulong clz(ulong x) { > - if (x == 0) > - return 64; > - union { uint i[2]; ulong x; } u; > - u.x = x; > - uint v = clz(u.i[1]); > - if(v == 32) > - v += clz(u.i[0]); > - return v; > -} > +#define SDEF(TYPE) \ > +OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);} > +SDEF(char); > +SDEF(uchar); > +SDEF(short); > +SDEF(ushort); > +SDEF(int); > +SDEF(uint); > +SDEF(long); > +SDEF(ulong); > +#undef SDEF > > #define SDEF(TYPE) \ > OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);} > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 7922ddb..7948c26 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -2902,6 +2902,7 @@ error: > regTranslator.newScalarProxy(ir::ocl::workdim, dst); break; > case GEN_OCL_FBH: > case GEN_OCL_FBL: > + case GEN_OCL_LZD: > case GEN_OCL_CBIT: > case GEN_OCL_COS: > case GEN_OCL_SIN: > @@ -3463,6 +3464,81 @@ error: > } > case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break; > case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break; > + case GEN_OCL_LZD: > + { > + Type *llvmDstType = I.getType(); > + ir::Type dstType = getType(ctx, llvmDstType); > + Type *llvmSrcType = I.getOperand(0)->getType(); > + ir::Type srcType = getUnsignedType(ctx, llvmSrcType); > + > + const ir::Register dst = this->getRegister(&I); > + 
const ir::Register src = this->getRegister(I.getOperand(0)); > + int imm_value = 0; > + if(srcType == ir::TYPE_U16) { > + imm_value = 16; > + }else if(srcType == ir::TYPE_U8) { > + imm_value = 24; > + }else if(srcType == ir::TYPE_U64) { > + imm_value = 32; > + } > + > + if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) { > + ir::ImmediateIndex imm; > + ir::Type tmpType = ir::TYPE_S32; > + imm = ctx.newIntegerImmediate(imm_value, tmpType); > + const ir::RegisterFamily family = getFamily(tmpType); > + const ir::Register immReg = ctx.reg(family); > + ctx.LOADI(ir::TYPE_S32, immReg, imm); > + > + ir::Register tmp0 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp1 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp2 = ctx.reg(getFamily(tmpType)); > + ctx.CVT(tmpType, srcType, tmp0, src); > + ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0); > + ctx.SUB(tmpType, tmp2, tmp1, immReg); > + ctx.CVT(dstType, tmpType, dst, tmp2); > + } > + else if(srcType == ir::TYPE_U64) { > + ir::ImmediateIndex imm; > + ir::Type tmpType = ir::TYPE_U32; > + imm = ctx.newIntegerImmediate(imm_value, srcType); > + const ir::RegisterFamily family = getFamily(srcType); > + const ir::Register immReg = ctx.reg(family); > + ctx.LOADI(ir::TYPE_S64, immReg, imm); > + > + const ir::RegisterFamily tmpFamily = getFamily(tmpType); > + const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, > tmpType); > + const ir::Register imm32Reg = ctx.reg(tmpFamily); > + ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32); > + > + ir::Register tmp0 = ctx.reg(getFamily(srcType)); > + ir::Register tmp1 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp2 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp3 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp4 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp5 = ctx.reg(getFamily(tmpType)); > + ir::Register tmp6 = ctx.reg(getFamily(tmpType)); > + ir::Register cmp = ctx.reg(ir::FAMILY_BOOL); > + > + ctx.SHR(srcType, tmp0, src, immReg); > + ctx.CVT(tmpType, srcType, tmp1, tmp0); 
> + > + ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1); > + ctx.LT(tmpType, cmp, tmp2, imm32Reg); > + > + ctx.CVT(tmpType, srcType, tmp3, src); > + ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3); > + ctx.ADD(tmpType, tmp5, tmp4, imm32Reg); > + > + ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5); > + ctx.CVT(dstType, tmpType, dst, tmp6); > + } > + else > + { > + ctx.ALU1(ir::OP_LZD, dstType, dst, src); > + } > + } > + break; > case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, > getUnsignedType(ctx, (*AI)->getType())); break; > case GEN_OCL_ABS: > { > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx > b/backend/src/llvm/llvm_gen_ocl_function.hxx > index 8ec8336..5a9b377 100644 > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx > @@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm) > DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless) > DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii) > DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell) > +DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd) > DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit) > > // saturate convert > -- > 1.9.1 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
