From: Luo Xionghu <[email protected]> The fbh-based implementation of clz is inefficient; implement clz with a new __gen_ocl_lzd builtin instead.
Signed-off-by: Luo Xionghu <[email protected]> --- backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 86 +++++++---------------------- backend/src/llvm/llvm_gen_backend.cpp | 76 +++++++++++++++++++++++++ backend/src/llvm/llvm_gen_ocl_function.hxx | 1 + 3 files changed, 98 insertions(+), 65 deletions(-) diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl index 6da0bab..36da959 100644 --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl @@ -19,6 +19,16 @@ PURE CONST uint __gen_ocl_fbh(uint); PURE CONST uint __gen_ocl_fbl(uint); + +PURE CONST OVERLOADABLE ulong __gen_ocl_lzd(ulong); +PURE CONST OVERLOADABLE long __gen_ocl_lzd(long); +PURE CONST OVERLOADABLE uint __gen_ocl_lzd(uint); +PURE CONST OVERLOADABLE int __gen_ocl_lzd(int); +PURE CONST OVERLOADABLE ushort __gen_ocl_lzd(ushort); +PURE CONST OVERLOADABLE short __gen_ocl_lzd(short); +PURE CONST OVERLOADABLE uchar __gen_ocl_lzd(uchar); +PURE CONST OVERLOADABLE char __gen_ocl_lzd(char); + PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort); @@ -26,71 +36,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char); -OVERLOADABLE char clz(char x) { - if (x < 0) - return 0; - if (x == 0) - return 8; - return __gen_ocl_fbh(x) - 24; -} - -OVERLOADABLE uchar clz(uchar x) { - if (x == 0) - return 8; - return __gen_ocl_fbh(x) - 24; -} - -OVERLOADABLE short clz(short x) { - if (x < 0) - return 0; - if (x == 0) - return 16; - return __gen_ocl_fbh(x) - 16; -} - -OVERLOADABLE ushort clz(ushort x) { - if (x == 0) - return 16; - return __gen_ocl_fbh(x) - 16; -} - -OVERLOADABLE int clz(int x) { - if (x < 0) - return 0; - if (x == 0) - return 32; - return __gen_ocl_fbh(x); -} - -OVERLOADABLE uint clz(uint x) { - if (x == 0) - return 32; - 
return __gen_ocl_fbh(x); -} - -OVERLOADABLE long clz(long x) { - union { int i[2]; long x; } u; - u.x = x; - if (u.i[1] & 0x80000000u) - return 0; - if (u.i[1] == 0 && u.i[0] == 0) - return 64; - uint v = clz(u.i[1]); - if(v == 32) - v += clz(u.i[0]); - return v; -} - -OVERLOADABLE ulong clz(ulong x) { - if (x == 0) - return 64; - union { uint i[2]; ulong x; } u; - u.x = x; - uint v = clz(u.i[1]); - if(v == 32) - v += clz(u.i[0]); - return v; -} +#define SDEF(TYPE) \ +OVERLOADABLE TYPE clz(TYPE x){ return __gen_ocl_lzd(x);} +SDEF(char); +SDEF(uchar); +SDEF(short); +SDEF(ushort); +SDEF(int); +SDEF(uint); +SDEF(long); +SDEF(ulong); +#undef SDEF #define SDEF(TYPE) \ OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);} diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 7922ddb..7948c26 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2902,6 +2902,7 @@ error: regTranslator.newScalarProxy(ir::ocl::workdim, dst); break; case GEN_OCL_FBH: case GEN_OCL_FBL: + case GEN_OCL_LZD: case GEN_OCL_CBIT: case GEN_OCL_COS: case GEN_OCL_SIN: @@ -3463,6 +3464,81 @@ error: } case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break; case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break; + case GEN_OCL_LZD: + { + Type *llvmDstType = I.getType(); + ir::Type dstType = getType(ctx, llvmDstType); + Type *llvmSrcType = I.getOperand(0)->getType(); + ir::Type srcType = getUnsignedType(ctx, llvmSrcType); + + const ir::Register dst = this->getRegister(&I); + const ir::Register src = this->getRegister(I.getOperand(0)); + int imm_value = 0; + if(srcType == ir::TYPE_U16) { + imm_value = 16; + }else if(srcType == ir::TYPE_U8) { + imm_value = 24; + }else if(srcType == ir::TYPE_U64) { + imm_value = 32; + } + + if(srcType == ir::TYPE_U16 || srcType == ir::TYPE_U8) { + ir::ImmediateIndex imm; + ir::Type tmpType = ir::TYPE_S32; + imm = ctx.newIntegerImmediate(imm_value, tmpType); + 
const ir::RegisterFamily family = getFamily(tmpType); + const ir::Register immReg = ctx.reg(family); + ctx.LOADI(ir::TYPE_S32, immReg, imm); + + ir::Register tmp0 = ctx.reg(getFamily(tmpType)); + ir::Register tmp1 = ctx.reg(getFamily(tmpType)); + ir::Register tmp2 = ctx.reg(getFamily(tmpType)); + ctx.CVT(tmpType, srcType, tmp0, src); + ctx.ALU1(ir::OP_LZD, tmpType, tmp1, tmp0); + ctx.SUB(tmpType, tmp2, tmp1, immReg); + ctx.CVT(dstType, tmpType, dst, tmp2); + } + else if(srcType == ir::TYPE_U64) { + ir::ImmediateIndex imm; + ir::Type tmpType = ir::TYPE_U32; + imm = ctx.newIntegerImmediate(imm_value, srcType); + const ir::RegisterFamily family = getFamily(srcType); + const ir::Register immReg = ctx.reg(family); + ctx.LOADI(ir::TYPE_S64, immReg, imm); + + const ir::RegisterFamily tmpFamily = getFamily(tmpType); + const ir::ImmediateIndex imm32 = ctx.newIntegerImmediate(32, tmpType); + const ir::Register imm32Reg = ctx.reg(tmpFamily); + ctx.LOADI(ir::TYPE_S32, imm32Reg, imm32); + + ir::Register tmp0 = ctx.reg(getFamily(srcType)); + ir::Register tmp1 = ctx.reg(getFamily(tmpType)); + ir::Register tmp2 = ctx.reg(getFamily(tmpType)); + ir::Register tmp3 = ctx.reg(getFamily(tmpType)); + ir::Register tmp4 = ctx.reg(getFamily(tmpType)); + ir::Register tmp5 = ctx.reg(getFamily(tmpType)); + ir::Register tmp6 = ctx.reg(getFamily(tmpType)); + ir::Register cmp = ctx.reg(ir::FAMILY_BOOL); + + ctx.SHR(srcType, tmp0, src, immReg); + ctx.CVT(tmpType, srcType, tmp1, tmp0); + + ctx.ALU1(ir::OP_LZD, tmpType, tmp2, tmp1); + ctx.LT(tmpType, cmp, tmp2, imm32Reg); + + ctx.CVT(tmpType, srcType, tmp3, src); + ctx.ALU1(ir::OP_LZD, tmpType, tmp4, tmp3); + ctx.ADD(tmpType, tmp5, tmp4, imm32Reg); + + ctx.SEL(tmpType, tmp6, cmp, tmp2, tmp5); + ctx.CVT(dstType, tmpType, dst, tmp6); + } + else + { + ctx.ALU1(ir::OP_LZD, dstType, dst, src); + } + } + break; case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break; case GEN_OCL_ABS: { diff --git 
a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 8ec8336..5a9b377 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -127,6 +127,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii) DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell) +DECL_LLVM_GEN_FUNCTION(LZD, __gen_ocl_lzd) DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit) // saturate convert -- 1.9.1 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
