This is an automated email from the ASF dual-hosted git repository.
jcf94 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 3306857 [Runtime] add set_output_zero_copy (#8497)
3306857 is described below
commit 3306857d80bfc76cdd10d7a40028f52b7ca696aa
Author: Swift.Sun <[email protected]>
AuthorDate: Fri Aug 27 17:28:50 2021 +0800
[Runtime] add set_output_zero_copy (#8497)
* Update graph_executor.h
* Update graph_executor.cc
* modify zero copy UT, add set input zero copy
* modify C style
* add runtime test
* relay build generates the json
Co-authored-by: hwstaff <[email protected]>
---
src/runtime/graph_executor/graph_executor.cc | 106 +++++++++++++++---
src/runtime/graph_executor/graph_executor.h | 28 +++++
tests/cpp/runtime_test.cc | 154 +++++++++++++++++++++++++++
3 files changed, 274 insertions(+), 14 deletions(-)
diff --git a/src/runtime/graph_executor/graph_executor.cc
b/src/runtime/graph_executor/graph_executor.cc
index bc73a59..dbd072a 100644
--- a/src/runtime/graph_executor/graph_executor.cc
+++ b/src/runtime/graph_executor/graph_executor.cc
@@ -91,6 +91,11 @@ void GraphExecutor::Init(const std::string& graph_json,
tvm::runtime::Module mod
std::string& name = nodes_[nid].name;
input_map_[name] = i;
}
+ for (size_t i = 0; i < outputs_.size(); i++) {
+ const uint32_t nid = outputs_[i].node_id;
+ std::string& name = nodes_[nid].name;
+ output_map_[name] = i;
+ }
}
/*!
* \brief Get the input index given the name of input.
@@ -105,6 +110,18 @@ int GraphExecutor::GetInputIndex(const std::string& name) {
return -1;
}
/*!
+ * \brief Get the output index given the name of output.
+ * \param name The name of the output.
+ * \return The index of output.
+ */
+int GraphExecutor::GetOutputIndex(const std::string& name) {
+ auto it = output_map_.find(name);
+ if (it != output_map_.end()) {
+ return it->second;
+ }
+ return -1;
+}
+/*!
* \brief set index-th input to the graph.
* \param index The input index.
* \param data_in The input data.
@@ -115,6 +132,23 @@ void GraphExecutor::SetInput(int index, DLTensor* data_in)
{
data_entry_[eid].CopyFrom(data_in);
}
/*!
+ * \brief Check the legality of external DLTensor*.
+ * \param external The external DLTensor*.
+ * \param eid The data_entry_ index.
+ */
+void GraphExecutor::CheckExternalDLTensor(const DLTensor* external, uint32_t
eid) const {
+ const DLTensor* internal = data_entry_[eid].operator->();
+
+ ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*external));
+ ICHECK_EQ(reinterpret_cast<size_t>(external->data) % kAllocAlignment, 0);
+ ICHECK_EQ(internal->ndim, static_cast<size_t>(external->ndim));
+ ICHECK_EQ(internal->device.device_type, external->device.device_type);
+ ICHECK_EQ(internal->device.device_id, external->device.device_id);
+ for (auto i = 0; i < external->ndim; ++i) {
+ ICHECK_EQ(internal->shape[i], external->shape[i]);
+ }
+}
+/*!
* \brief set index-th input to the graph without copying the data.
* \param index The input index.
* \param data_ref The input data that is referred.
@@ -122,24 +156,38 @@ void GraphExecutor::SetInput(int index, DLTensor*
data_in) {
void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) {
ICHECK_LT(static_cast<size_t>(index), input_nodes_.size());
uint32_t eid = this->entry_id(input_nodes_[index], 0);
- const DLTensor* old_t = data_entry_[eid].operator->();
-
// check the consistency of input
- ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref));
- ICHECK_EQ(reinterpret_cast<size_t>(data_ref->data) % kAllocAlignment, 0);
- ICHECK_EQ(old_t->ndim, static_cast<size_t>(data_ref->ndim));
- ICHECK_EQ(old_t->device.device_type, data_ref->device.device_type);
- ICHECK_EQ(old_t->device.device_id, data_ref->device.device_id);
- for (auto i = 0; i < data_ref->ndim; ++i) {
- ICHECK_EQ(old_t->shape[i], data_ref->shape[i]);
- }
-
+ CheckExternalDLTensor(data_ref, eid);
// Update the data pointer for each argument of each op
for (DLTensor* t : input_dltensors_[eid]) {
t->data = data_ref->data;
}
}
/*!
+ * \brief set index-th output to the graph without copying the data.
+ * \param index The output index.
+ * \param data_ref The output data that is referred.
+ */
+void GraphExecutor::SetOutputZeroCopy(int index, DLTensor* data_ref) {
+ ICHECK_LT(static_cast<size_t>(index), outputs_.size());
+ ICHECK_LT(static_cast<size_t>(index), output_dltensors_.size());
+ const NodeEntry& output_node = outputs_[index];
+ uint32_t output_node_eid = this->entry_id(output_node);
+
+ // check the consistency of output
+ CheckExternalDLTensor(data_ref, output_node_eid);
+
+ // Update the data pointer for output op
+ for (DLTensor* t : output_dltensors_[output_node_eid]) {
+ t->data = data_ref->data;
+ }
+
+ // Update the input of the op connected to the output
+ for (DLTensor* t : both_output_opinput_dltensors_[output_node_eid]) {
+ t->data = data_ref->data;
+ }
+}
+/*!
* \brief Get the number of outputs
*
* \return The number of outputs from graph.
@@ -358,11 +406,17 @@ void GraphExecutor::SetupStorage() {
void GraphExecutor::SetupOpExecs() {
op_execs_.resize(this->GetNumOfNodes());
input_dltensors_.resize(num_node_entries());
+ output_dltensors_.resize(num_node_entries());
+ both_output_opinput_dltensors_.resize(num_node_entries());
std::unordered_set<uint32_t> input_node_eids;
for (size_t i = 0; i < input_nodes_.size(); i++) {
uint32_t nid = input_nodes_[i];
input_node_eids.insert(entry_id(nid, 0));
}
+ std::unordered_set<uint32_t> output_node_eids;
+ for (size_t i = 0; i < outputs_.size(); i++) {
+ output_node_eids.insert(entry_id(outputs_[i]));
+ }
// setup the array and requirements.
for (uint32_t nid = 0; nid < this->GetNumOfNodes(); ++nid) {
@@ -383,10 +437,25 @@ void GraphExecutor::SetupOpExecs() {
std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args);
for (size_t i = 0; i < inode.inputs.size(); i++) {
- uint32_t eid = this->entry_id(inode.inputs[i]);
+ uint32_t input_eid = this->entry_id(inode.inputs[i]);
// check if op input is model input
- if (input_node_eids.count(eid) > 0) {
-
input_dltensors_[eid].push_back(static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
+ if (input_node_eids.count(input_eid) > 0) {
+ input_dltensors_[input_eid].push_back(
+ static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
+ }
+ // check if any model output is the input of the op
+ if (output_node_eids.count(input_eid) > 0) {
+ both_output_opinput_dltensors_[input_eid].push_back(
+ static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
+ }
+ }
+
+ for (uint32_t i = inode.inputs.size(); i < inode.inputs.size() +
inode.param.num_outputs; ++i) {
+ uint32_t output_eid = this->entry_id(nid, i - inode.inputs.size());
+ // check if op output is model output
+ if (output_node_eids.count(output_eid) > 0) {
+ output_dltensors_[output_eid].push_back(
+ static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
}
}
}
@@ -462,6 +531,15 @@ PackedFunc GraphExecutor::GetFunction(const std::string&
name,
this->SetInputZeroCopy(args[0], args[1]);
}
});
+ } else if (name == "set_output_zero_copy") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+ if (String::CanConvertFrom(args[0])) {
+ int out_idx = this->GetOutputIndex(args[0].operator String());
+ if (out_idx >= 0) this->SetOutputZeroCopy(out_idx, args[1]);
+ } else {
+ this->SetOutputZeroCopy(args[0], args[1]);
+ }
+ });
} else if (name == "get_output") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
if (args.num_args == 2) {
diff --git a/src/runtime/graph_executor/graph_executor.h
b/src/runtime/graph_executor/graph_executor.h
index 42b5c40..87e8aa3 100644
--- a/src/runtime/graph_executor/graph_executor.h
+++ b/src/runtime/graph_executor/graph_executor.h
@@ -108,6 +108,13 @@ class TVM_DLL GraphExecutor : public ModuleNode {
int GetInputIndex(const std::string& name);
/*!
+ * \brief Get the output index given the name of output.
+ * \param name The name of the output.
+ * \return The index of output.
+ */
+ int GetOutputIndex(const std::string& name);
+
+ /*!
* \brief set index-th input to the graph.
* \param index The input index.
* \param data_in The input data.
@@ -120,6 +127,12 @@ class TVM_DLL GraphExecutor : public ModuleNode {
*/
void SetInputZeroCopy(int index, DLTensor* data_ref);
/*!
+ * \brief set index-th output to the graph without copying the data.
+ * \param index The output index.
+ * \param data_ref The output data that is referred.
+ */
+ void SetOutputZeroCopy(int index, DLTensor* data_ref);
+ /*!
* \brief Get the number of outputs
*
* \return The number of outputs from graph.
@@ -193,6 +206,9 @@ class TVM_DLL GraphExecutor : public ModuleNode {
uint32_t node_id;
uint32_t index;
uint32_t version;
+ inline bool operator==(const NodeEntry& other) const {
+ return node_id == other.node_id && index == other.index && version ==
other.version;
+ }
// JSON Loader
void Load(dmlc::JSONReader* reader) {
reader->BeginArray();
@@ -378,6 +394,12 @@ class TVM_DLL GraphExecutor : public ModuleNode {
/*! \brief Setup the executors. */
void SetupOpExecs();
/*!
+ * \brief Check the legality of external DLTensor*.
+ * \param external The external DLTensor*.
+ * \param eid The data_entry_ index.
+ */
+ void CheckExternalDLTensor(const DLTensor* external, uint32_t eid) const;
+ /*!
* \brief Create an execution function given input.
* \param attrs The node attributes.
* \param args The arguments to the functor, including inputs and outputs.
@@ -397,8 +419,14 @@ class TVM_DLL GraphExecutor : public ModuleNode {
std::vector<uint32_t> input_nodes_;
/*! \brief Map of input names to input indices. */
std::unordered_map<std::string, uint32_t> input_map_;
+ /*! \brief Map of output names to output indices. */
+ std::unordered_map<std::string, uint32_t> output_map_;
/*! \brief Used for quick node input DLTensor* lookup given an input eid. */
std::vector<std::vector<DLTensor*>> input_dltensors_;
+ /*! \brief Used for quick node output DLTensor* lookup given an output eid.
*/
+ std::vector<std::vector<DLTensor*>> output_dltensors_;
+ /*! \brief Used for quick node(both model output and op input) DLTensor*
lookup given an eid. */
+ std::vector<std::vector<DLTensor*>> both_output_opinput_dltensors_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
diff --git a/tests/cpp/runtime_test.cc b/tests/cpp/runtime_test.cc
new file mode 100644
index 0000000..6dbcd61
--- /dev/null
+++ b/tests/cpp/runtime_test.cc
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/op_strategy.h>
+#include <tvm/relay/transform.h>
+#include <tvm/relay/type.h>
+#include <tvm/runtime/executor_info.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/topi/broadcast.h>
+#include <tvm/topi/generic/injective.h>
+
+using namespace tvm;
+using namespace tvm::relay;
+
+TVM_REGISTER_GLOBAL("runtime_test.strategy")
+ .set_body_typed([](const Attrs& attrs, const Array<te::Tensor>& inputs,
const Type& out_type,
+ const Target& target) {
+ FTVMCompute fcompute = [](const Attrs& attrs, const Array<te::Tensor>&
inputs,
+ const Type& out_type) -> Array<te::Tensor> {
+ ICHECK_EQ(inputs.size(), 2U);
+ return {topi::add(inputs[0], inputs[1])};
+ };
+ FTVMSchedule fschedule = [](const Attrs& attrs, const Array<te::Tensor>&
outs,
+ const Target& target) {
+ With<Target> target_scope(target);
+ return topi::generic::schedule_injective(target, outs);
+ };
+
+ auto n = make_object<OpStrategyNode>();
+ auto strategy = tvm::relay::OpStrategy(std::move(n));
+ strategy.AddImplementation(fcompute, fschedule, "runtime_test.strategy",
10);
+ return strategy;
+ });
+
+TEST(Runtime, ZeroCopy) {
+ auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32));
+ auto a = relay::Var("a", tensor_type);
+ auto b = relay::Var("b", tensor_type);
+ auto add_op = relay::Op::Get("add");
+ auto x = relay::Call(add_op, {a, b}, tvm::Attrs(), {});
+ auto c = relay::Var("c", tensor_type);
+ auto y = relay::Call(add_op, {x, c}, tvm::Attrs(), {});
+ auto func = relay::Function(relay::FreeVars(y), y, relay::Type(), {});
+ auto A = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto B = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto C = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto Y = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU,
0});
+
+ auto pA = static_cast<float*>(A->data);
+ auto pB = static_cast<float*>(B->data);
+ auto pC = static_cast<float*>(C->data);
+ auto pY = static_cast<float*>(Y->data);
+
+ for (int i = 0; i < 6; ++i) {
+ pA[i] = i;
+ pB[i] = i + 1;
+ pC[i] = i + 2;
+ }
+ // get schedule
+ auto reg = tvm::runtime::Registry::Get("ir.RegisterOpAttr");
+ if (!reg) {
+ LOG(FATAL) << "no _Register";
+ }
+ auto fs = tvm::runtime::Registry::Get("runtime_test.strategy");
+ if (!fs) {
+ LOG(FATAL) << "No test_strategy registered.";
+ }
+ auto fgeneric =
GenericFunc::Get("runtime_test.strategy_generic").set_default(*fs);
+ (*reg)("add", "FTVMStrategy", fgeneric, 10);
+ Array<Integer> dep;
+ dep.push_back(0);
+ (*reg)("add", "TShapeDataDependent", dep, 10);
+ // build
+ auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule");
+ tvm::runtime::Module build_mod = (*pfb)();
+ auto build_f = build_mod.GetFunction("build", false);
+ auto json_f = build_mod.GetFunction("get_graph_json", false);
+ auto mod_f = build_mod.GetFunction("get_module", false);
+ Map<tvm::Integer, tvm::Target> targets;
+ Target llvm_tgt = Target("llvm");
+ targets.Set(0, llvm_tgt);
+ auto relay_mod = tvm::IRModule::FromExpr(func);
+ ICHECK(relay_mod.defined()) << "Module must be defined";
+ build_f(relay_mod, targets, llvm_tgt, runtime::kTvmExecutorGraph, "");
+ // create graph executor
+ std::string json = json_f();
+ tvm::runtime::Module mod = mod_f();
+ auto dev = A->device;
+ auto pfr = tvm::runtime::Registry::Get("tvm.graph_executor.create");
+ ICHECK(mod.defined()) << "Module must be defined";
+ tvm::runtime::Module run_mod =
+ (*pfr)(json, mod, static_cast<int>(dev.device_type), dev.device_id);
+ // get function
+ auto set_input_f = run_mod.GetFunction("set_input_zero_copy", false);
+ auto set_output_f = run_mod.GetFunction("set_output_zero_copy", false);
+ auto run_f = run_mod.GetFunction("run", false);
+ // set input zero copy
+ set_input_f("a", const_cast<DLTensor*>(A.operator->()));
+ set_input_f("b", const_cast<DLTensor*>(B.operator->()));
+ set_input_f("c", const_cast<DLTensor*>(C.operator->()));
+ // set output zero copy
+ set_output_f(0, const_cast<DLTensor*>(Y.operator->()));
+ run_f();
+ // check correctness
+ for (int i = 0; i < 6; ++i) {
+ ICHECK_LT(fabs(pY[i] - (i + (i + 1) + (i + 2))), 1e-4);
+ }
+ // mutate the input a bit and run it again
+ for (int i = 0; i < 6; ++i) {
+ pB[i] = i + 3;
+ }
+ run_f();
+ // check correctness
+ for (int i = 0; i < 6; ++i) {
+ ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 2))), 1e-4);
+ }
+ // attach a different input and run it again
+ auto C2 = tvm::runtime::NDArray::Empty({2, 3}, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto pC2 = static_cast<float*>(C2->data);
+ for (int i = 0; i < 6; ++i) {
+ pC2[i] = i + 4;
+ }
+ set_input_f("c", const_cast<DLTensor*>(C2.operator->()));
+ run_f();
+ // check correctness
+ for (int i = 0; i < 6; ++i) {
+ ICHECK_LT(fabs(pY[i] - (i + (i + 3) + (i + 4))), 1e-4);
+ }
+}