This is an automated email from the ASF dual-hosted git repository.
wuwei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git
The following commit(s) were added to refs/heads/master by this push:
new 05b0f7e [CodeGen][CUDA] Vectorization for intrinsics (#5101)
05b0f7e is described below
commit 05b0f7e00217958d4b2017802b2e4bddcc3aaa0b
Author: Wei Pan <[email protected]>
AuthorDate: Sun Mar 22 12:22:29 2020 -0700
[CodeGen][CUDA] Vectorization for intrinsics (#5101)
- This allows to emit vectorized loads/stores
for CUDA math intrinsics.
- A few intrinsics should be lowered as CUDAMath not CUDAFastMath ones.
- Fixed the code block indentation.
---
src/target/source/codegen_c.h | 23 ++++
src/target/source/codegen_cuda.cc | 118 +++++++++++++-------
src/target/source/intrin_rule_cuda.cc | 26 +++--
tests/python/unittest/test_target_codegen_cuda.py | 124 +++++++++++++++++++++-
4 files changed, 238 insertions(+), 53 deletions(-)
diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h
index a9da780..c1894a3 100644
--- a/src/target/source/codegen_c.h
+++ b/src/target/source/codegen_c.h
@@ -257,6 +257,29 @@ class CodeGenC :
/*! \brief the data type of allocated buffers */
std::unordered_map<const VarNode*, DataType> handle_data_type_;
+ /*!
+ * \brief A RAII utility class for emitting code in a scoped region.
+ */
+ class EnterScopeRAII {
+ // The codegen context.
+ CodeGenC* cg;
+
+ // The new scope level.
+ int scope;
+
+ public:
+ explicit EnterScopeRAII(CodeGenC* cg) : cg(cg) {
+ cg->PrintIndent();
+ cg->stream << "{\n";
+ scope = cg->BeginScope();
+ }
+ ~EnterScopeRAII() {
+ cg->EndScope(scope);
+ cg->PrintIndent();
+ cg->stream << "}\n";
+ }
+ };
+
private:
/*! \brief whether to print in SSA form */
bool print_ssa_form_{false};
diff --git a/src/target/source/codegen_cuda.cc
b/src/target/source/codegen_cuda.cc
index 2cc7b92..f8bc873 100644
--- a/src/target/source/codegen_cuda.cc
+++ b/src/target/source/codegen_cuda.cc
@@ -24,6 +24,7 @@
#include <tvm/runtime/registry.h>
#include <cmath>
+#include <utility>
#include <vector>
#include <string>
#include "literal/cuda_half_t.h"
@@ -235,25 +236,19 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os)
{ // NOLINT(*)
void CodeGenCUDA::PrintVecBinaryOp(
const std::string& op, DataType t,
PrimExpr lhs, PrimExpr rhs, std::ostream& os) { // NOLINT(*)
- // unpacking operations.
- int lanes = t.lanes();
-
+  // Declare the result.
+ std::string sret = GetUniqueName("_");
+ this->PrintIndent();
+ this->PrintType(t, stream);
+ stream << ' ' << sret << ";\n";
{
- // The assignment below introduces side-effect, and the resulting value
cannot
- // be reused across multiple expression, thus a new scope is needed
- int vec_scope = BeginScope();
+ EnterScopeRAII scope(this);
- // default: unpack into individual ops.
+ // Unpack into individual ops.
std::string vlhs = SSAGetID(PrintExpr(lhs), lhs.dtype());
std::string vrhs = SSAGetID(PrintExpr(rhs), rhs.dtype());
- std::string sret = GetUniqueName("_");
- {
- // delcare type.
- this->PrintIndent();
- this->PrintType(t, stream);
- stream << ' ' << sret << ";\n";
- }
- for (int i = 0; i < lanes; ++i) {
+
+ for (int i = 0, lanes = t.lanes(); i < lanes; ++i) {
std::ostringstream value_temp;
if (isalpha(op[0])) {
value_temp << op << "(";
@@ -270,9 +265,8 @@ void CodeGenCUDA::PrintVecBinaryOp(
}
PrintVecElemStore(sret, t, i, value_temp.str());
}
- os << sret;
- EndScope(vec_scope);
}
+ os << sret;
}
void CodeGenCUDA::PrintVecElemLoad(
@@ -418,6 +412,54 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op,
std::ostream& os) {
this->PrintExpr(op->args[i * 2 + 1], os);
os << "]" << ((i < 3) ? ", ": ")");
}
+ } else if (op->call_type == CallNode::PureExtern && op->dtype.is_vector()) {
+ //
+ // Emit an unsupported vector call
+ //
+ // v = intrin_f((float4*)A[0], (float4*)B[0])
+ //
+ // as
+ //
+ // float4 __ret;
+ // {
+ // float4 __arg0 = ((float4*)A)[0];
+ // float4 __arg1 = ((float4*)B)[0];
+ // __ret.x = intrin_f(__arg0.x, __arg1.x);
+ // __ret.y = intrin_f(__arg0.y, __arg1.y);
+ // __ret.z = intrin_f(__arg0.z, __arg1.z);
+ // __ret.w = intrin_f(__arg0.w, __arg1.w);
+ // }
+ // v = __ret;
+ //
+ // Declare the result vector.
+ std::string sret = GetUniqueName("_");
+ this->PrintIndent();
+ this->PrintType(op->dtype, stream);
+ stream << ' ' << sret << ";\n";
+ {
+ EnterScopeRAII scope(this);
+
+ // Load arguments.
+ std::vector<std::string> sargs;
+ for (size_t i = 0; i < op->args.size(); ++i) {
+ std::string val = SSAGetID(PrintExpr(op->args[i]),
op->args[i].dtype());
+ sargs.push_back(std::move(val));
+ }
+
+ // Emit a scalar call for each lane.
+ for (int i = 0; i < op->dtype.lanes(); ++i) {
+ std::ostringstream scall;
+ scall << op->name << "(";
+ for (size_t j = 0; j < op->args.size(); ++j) {
+ if (j > 0)
+ scall << ", ";
+ PrintVecElemLoad(sargs[j], op->args[j].dtype(), i, scall);
+ }
+ scall << ")";
+ PrintVecElemStore(sret, op->dtype, i, scall.str());
+ }
+ }
+ os << sret;
} else {
CodeGenC::VisitExpr_(op, os);
}
@@ -580,34 +622,34 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op,
std::ostream &os) {
op->true_value->dtype == op->dtype &&
op->dtype.lanes() == op->condition.dtype().lanes());
- int lanes = op->dtype.lanes();
- int scope = BeginScope();
-
- std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
- std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
- std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
std::string r_var = GetUniqueName("_");
-
this->PrintIndent();
this->PrintType(op->dtype, stream);
stream << ' ' << r_var << ";\n";
+ {
+ EnterScopeRAII scope(this);
+
+ std::string c_var = SSAGetID(PrintExpr(op->condition), op->dtype);
+ std::string t_var = SSAGetID(PrintExpr(op->true_value), op->dtype);
+ std::string f_var = SSAGetID(PrintExpr(op->false_value), op->dtype);
- // The condition is stored as an ushort vector.
- DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
-
- for (int i = 0; i < lanes; ++i) {
- std::ostringstream item;
- item << "(bool(";
- PrintVecElemLoad(c_var, memory_ty, i, item);
- item << ")?";
- PrintVecElemLoad(t_var, op->dtype, i, item);
- item << ':';
- PrintVecElemLoad(f_var, op->dtype, i, item);
- item << ')';
- PrintVecElemStore(r_var, op->dtype, i, item.str());
+    // The condition is stored as a ushort vector.
+ int lanes = op->dtype.lanes();
+ DataType memory_ty(DataType::TypeCode::kUInt, 16, lanes);
+
+ for (int i = 0; i < lanes; ++i) {
+ std::ostringstream item;
+ item << "(bool(";
+ PrintVecElemLoad(c_var, memory_ty, i, item);
+ item << ")?";
+ PrintVecElemLoad(t_var, op->dtype, i, item);
+ item << ':';
+ PrintVecElemLoad(f_var, op->dtype, i, item);
+ item << ')';
+ PrintVecElemStore(r_var, op->dtype, i, item.str());
+ }
}
os << r_var;
- EndScope(scope);
}
inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA*
p) { // NOLINT(*)
diff --git a/src/target/source/intrin_rule_cuda.cc
b/src/target/source/intrin_rule_cuda.cc
index d009110..d944120 100644
--- a/src/target/source/intrin_rule_cuda.cc
+++ b/src/target/source/intrin_rule_cuda.cc
@@ -29,14 +29,12 @@ namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
struct CUDAMath {
std::string operator()(DataType t, std::string name) const {
- if (t.lanes() == 1) {
- if (t.is_float()) {
- switch (t.bits()) {
- case 64: return name;
- case 32: return name + 'f';
- case 16: return 'h' + name;
- default: return "";
- }
+ if (t.is_float()) {
+ switch (t.bits()) {
+ case 64: return name;
+ case 32: return name + 'f';
+ case 16: return 'h' + name;
+ default: return "";
}
}
return "";
@@ -45,7 +43,7 @@ struct CUDAMath {
struct CUDAFastMath : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
- if (t.lanes() == 1 && t.is_float() && t.bits() == 32) {
+ if (t.is_float() && t.bits() == 32) {
return "__" + name + 'f';
} else {
return CUDAMath::operator()(t, name);
@@ -56,7 +54,7 @@ struct CUDAFastMath : public CUDAMath {
struct CUDAFastMathTan : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
- if (t.lanes() == 1 && t.is_float()) {
+ if (t.is_float()) {
switch (t.bits()) {
case 64: return name;
// `__tanf` seems to produce some values too deviant from numpy tan
version.
@@ -72,7 +70,7 @@ struct CUDAFastMathTan : public CUDAMath {
struct CUDAPopcount {
std::string operator()(DataType t, std::string name) const {
- if (t.lanes() == 1 && t.is_uint()) {
+ if (t.is_uint()) {
switch (t.bits()) {
case 32: return "__popc";
case 64: return "__popcll";
@@ -108,7 +106,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp")
.set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp2")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.exp10")
.set_body(DispatchExtern<CUDAFastMath>);
@@ -132,13 +130,13 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
.set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cosh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sin")
.set_body(DispatchExtern<CUDAFastMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.sinh")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAMath>);
TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.atan")
.set_body(DispatchExtern<CUDAMath>);
diff --git a/tests/python/unittest/test_target_codegen_cuda.py
b/tests/python/unittest/test_target_codegen_cuda.py
index 083cede..e8c6cd1 100644
--- a/tests/python/unittest/test_target_codegen_cuda.py
+++ b/tests/python/unittest/test_target_codegen_cuda.py
@@ -348,6 +348,125 @@ def test_cuda_floordiv_with_vectorization():
func(a_nd, b_nd)
tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
+def sched(B):
+ s = te.create_schedule(B.op)
+ io, ii = s[B].split(s[B].op.axis[0], nparts=1)
+ iio, iii = s[B].split(ii, nparts=32)
+ _, iiii = s[B].split(iii, factor=4)
+ s[B].vectorize(iiii)
+ s[B].bind(io, bx)
+ s[B].bind(iio, tx)
+ return s
+
+def test_vectorized_intrin1():
+ test_funcs = [
+ (tvm.tir.floor, lambda x : np.floor(x)),
+ (tvm.tir.ceil, lambda x : np.ceil(x)),
+ (tvm.tir.trunc, lambda x : np.trunc(x)),
+ (tvm.tir.abs, lambda x : np.fabs(x)),
+ (tvm.tir.round, lambda x : np.round(x)),
+ (tvm.tir.exp, lambda x : np.exp(x)),
+ (tvm.tir.exp2, lambda x : np.exp2(x)),
+ (tvm.tir.exp10, lambda x : np.power(10,x)),
+ (tvm.tir.log, lambda x : np.log(x)),
+ (tvm.tir.log2, lambda x : np.log2(x)),
+ (tvm.tir.log10, lambda x : np.log10(x)),
+ (tvm.tir.tan, lambda x : np.tan(x)),
+ (tvm.tir.cos, lambda x : np.cos(x)),
+ (tvm.tir.cosh, lambda x : np.cosh(x)),
+ (tvm.tir.sin, lambda x : np.sin(x)),
+ (tvm.tir.sinh, lambda x : np.sinh(x)),
+ (tvm.tir.atan, lambda x : np.arctan(x)),
+ (tvm.tir.tanh, lambda x : np.tanh(x)),
+ (tvm.tir.sqrt, lambda x : np.sqrt(x)),
+ ]
+ def run_test(tvm_intrin, np_func, dtype):
+ if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+ print("skip because cuda is not enabled..")
+ return
+ if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
+ print("Skip because gpu does not have fp16 support")
+ return
+ # set of intrinsics does not support fp16 yet.
+ skip_set = {tvm.tir.abs,
+ tvm.tir.round,
+ tvm.tir.tan,
+ tvm.tir.atan,
+ tvm.tir.tanh,
+ tvm.tir.cosh,
+ tvm.tir.sinh}
+ if dtype == "float16" and tvm_intrin in skip_set:
+ print("Skip because '{0}' does not support fp16
yet".format(tvm_intrin.__name__))
+ return
+
+ n = 128
+ A = te.placeholder((n,), dtype=dtype, name='A')
+ B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name='B')
+ s = sched(B)
+ f = tvm.build(s, [A, B], "cuda")
+ ctx = tvm.gpu(0)
+ a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+ b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+ f(a, b)
+ tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()),
atol=1e-3, rtol=1e-3)
+
+ for func in test_funcs:
+ run_test(*func, "float32")
+ run_test(*func, "float16")
+
+def test_vectorized_intrin2(dtype="float32"):
+ c2 = tvm.tir.const(2, dtype=dtype)
+ test_funcs = [
+ (tvm.tir.power, lambda x : np.power(x, 2.0)),
+ (tvm.tir.fmod, lambda x : np.fmod(x, 2.0))
+ ]
+ def run_test(tvm_intrin, np_func):
+ if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+ print("skip because cuda is not enabled..")
+ return
+
+ n = 128
+ A = te.placeholder((n,), dtype=dtype, name='A')
+ B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name='B')
+ s = sched(B)
+ f = tvm.build(s, [A, B], "cuda")
+ ctx = tvm.gpu(0)
+ a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), ctx)
+ b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), ctx)
+ f(a, b)
+ tvm.testing.assert_allclose(b.asnumpy(), np_func(a.asnumpy()),
atol=1e-3, rtol=1e-3)
+
+ for func in test_funcs:
+ run_test(*func)
+
+def test_vectorized_popcount():
+ def ref_popcount(x):
+ cnt = 0
+ while x:
+ x -= x & -x
+ cnt += 1
+ return cnt
+
+ def run_test(dtype):
+ if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
+ print("skip because cuda is not enabled..")
+ return
+
+ n = 128
+ A = te.placeholder((n,), dtype=dtype, name='A')
+ B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name='B')
+ s = sched(B)
+ f = tvm.build(s, [A, B], "cuda")
+ ctx = tvm.gpu(0)
+ a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype),
ctx)
+ b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), ctx)
+ f(a, b)
+ ref = np.vectorize(ref_popcount)(a.asnumpy())
+ tvm.testing.assert_allclose(b.asnumpy(), ref)
+
+ run_test("uint32")
+ run_test("uint64")
+
if __name__ == "__main__":
test_cuda_vectorize_add()
test_cuda_multiply_add()
@@ -359,4 +478,7 @@ if __name__ == "__main__":
test_rfactor_predicates()
test_cuda_const_float_to_half()
test_cuda_reduction()
- test_cuda_floordiv_with_vectorization()
\ No newline at end of file
+ test_cuda_floordiv_with_vectorization()
+ test_vectorized_intrin1()
+ test_vectorized_intrin2()
+ test_vectorized_popcount()
\ No newline at end of file