larroy commented on a change in pull request #14836: Refactor AGInfo and Imperative
URL: https://github.com/apache/incubator-mxnet/pull/14836#discussion_r318802544
##########
File path: src/imperative/imperative.cc
##########
@@ -316,181 +308,220 @@ std::vector<NDArray*> Imperative::Backward(
info.outputs.back() = static_cast<real_t>(1.0);
}
}
+ return ograd_entries;
+}
- // Get gradient graph
- Symbol sym;
- sym.outputs = graph.outputs;
- std::vector<NodeEntry> xs;
- std::vector<NDArray*> x_grads;
- std::vector<OpReqType> x_reqs;
- if (variables.size()) {
- xs.reserve(variables.size());
- x_grads.reserve(variables.size());
- x_reqs.reserve(variables.size());
+struct Imperative::GradientVariableNodes {
+ std::vector<nnvm::NodeEntry> variable_nodes;
+ std::vector<NDArray*> gradients;
+ std::vector<OpReqType> op_req_types;
+};
+
+Imperative::GradientVariableNodes Imperative::CreateGradientVariableNodes(
+ const std::vector<NDArray *> &variables,
+ const std::vector<nnvm::NodeEntry> &outputs) {
+ GradientVariableNodes var_nodes;
+ if (!variables.empty()) {
+ var_nodes.variable_nodes.reserve(variables.size());
+ var_nodes.gradients.reserve(variables.size());
+ var_nodes.op_req_types.reserve(variables.size());
for (size_t i = 0; i < variables.size(); ++i) {
CHECK(!AGInfo::IsNone(*variables[i]) &&
- AGInfo::IsVariable(variables[i]->entry_.node))
+ AGInfo::IsVariable(variables[i]->autograd_.node))
<< "Cannot differentiate with respect to the " << i+1 << "-th
variable"
- << " because it does not require gradient.";
- xs.emplace_back(variables[i]->entry_);
- x_grads.push_back(new NDArray());
- x_reqs.push_back(kWriteTo);
+          << " because it does not require gradient. Did you forget attach_grad()?";
+ var_nodes.variable_nodes.emplace_back(variables[i]->autograd_);
+ var_nodes.gradients.push_back(new NDArray());
+ var_nodes.op_req_types.push_back(kWriteTo);
}
} else {
- std::vector<NodePtr> args = sym.ListInputs(Symbol::kReadOnlyArgs);
- xs.reserve(args.size());
- x_grads.reserve(args.size());
- x_reqs.reserve(args.size());
- for (const auto& i : args) {
- AGInfo& info = AGInfo::Get(i);
- if (info.grad_req == kNullOp) continue;
- xs.emplace_back(NodeEntry{i, 0, 0});
- x_grads.push_back(&info.out_grads[0]);
- x_reqs.push_back(info.grad_req);
- info.fresh_out_grad = true;
+ nnvm::Symbol s;
+ s.outputs = outputs;
+    std::vector<nnvm::NodePtr> input_ro_nodes = s.ListInputs(Symbol::kReadOnlyArgs);
+ var_nodes.variable_nodes.reserve(input_ro_nodes.size());
+ var_nodes.gradients.reserve(input_ro_nodes.size());
+ var_nodes.op_req_types.reserve(input_ro_nodes.size());
+ for (const auto& node : input_ro_nodes) {
+ AGInfo& info = AGInfo::Get(node);
+ if (info.grad_req != kNullOp) {
+ var_nodes.variable_nodes.emplace_back(node);
+ var_nodes.gradients.push_back(&info.out_grads[0]);
+ var_nodes.op_req_types.push_back(info.grad_req);
+ info.fresh_out_grad = true;
+ }
}
- CHECK_GT(xs.size(), 0)
+ CHECK_GT(var_nodes.variable_nodes.size(), 0)
<< "There are no inputs in computation graph that require gradients.";
}
+ return var_nodes;
+}
- Graph g_graph = pass::MXGradient(
- graph, graph.outputs, xs, ograd_entries,
+std::vector<NDArray*> Imperative::Backward(
+ const std::vector<NDArray*>& outputs,
+ const std::vector<NDArray*>& ograds,
+ const std::vector<NDArray*>& variables,
+ bool is_train, bool retain_graph,
+ bool create_graph) {
+ using namespace nnvm;
+ using namespace imperative;
+  static const std::vector<const Op*> zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")};
+ static const Op* copy_op = Op::Get("_copy");
+
+ Graph graph = CreateGraph(outputs);
+
+ // Prepare head gradient nodes
+  std::vector<NodeEntry> ograd_entries = CreateHeadGradientNodes(outputs, ograds);
+
+ // Get variable nodes
+  GradientVariableNodes gvars = CreateGradientVariableNodes(variables, graph.outputs);
+
+ // Run backward on the graph
+ Graph gradient_graph = pass::MXGradient(
+ graph, graph.outputs, gvars.variable_nodes, ograd_entries,
exec::AggregateGradient, nullptr, nullptr,
zero_ops, "_copy");
- CHECK_EQ(g_graph.outputs.size(), xs.size());
- for (const auto& e : g_graph.outputs) {
- if (e.node->op() == nullptr) {
+
+ CHECK_EQ(gradient_graph.outputs.size(), gvars.variable_nodes.size());
+ std::vector<nnvm::NodeEntry> forward_outputs = graph.outputs;
+ const size_t num_forward_outputs = graph.outputs.size();
+
+ // TODO(larroy): move inside pass::MXGradient
+ for (const auto& backward_node : gradient_graph.outputs) {
+ if (backward_node.node->is_variable()) {
auto node = Node::Create();
node->attrs.op = copy_op;
- node->inputs.push_back(e);
+ node->inputs.push_back(backward_node);
graph.outputs.emplace_back(std::move(node));
} else {
- graph.outputs.push_back(e);
+ graph.outputs.push_back(backward_node);
}
}
- const auto& idx = graph.indexed_graph();
+
+ auto& indexed_graph = graph.indexed_graph();
// get number of nodes used in forward pass
size_t num_forward_nodes = 0;
size_t num_forward_entries = 0;
for (size_t i = 0; i < num_forward_outputs; ++i) {
num_forward_nodes = std::max(
- num_forward_nodes, static_cast<size_t>(idx.outputs()[i].node_id + 1));
+        num_forward_nodes, static_cast<size_t>(indexed_graph.outputs()[i].node_id + 1));
num_forward_entries = std::max(
-        num_forward_entries, static_cast<size_t>(idx.entry_id(idx.outputs()[i])) + 1);
+ num_forward_entries, static_cast<size_t>(indexed_graph.entry_id(
+ indexed_graph.outputs()[i])) + 1);
}
// Allocate buffer
- std::vector<NDArray> buff(idx.num_node_entries());
+ std::vector<NDArray> buff(indexed_graph.num_node_entries());
std::vector<uint32_t> ref_count(buff.size(), 0);
std::vector<OpStatePtr> states;
std::vector<NDArray*> arrays;
arrays.reserve(buff.size());
- for (auto& buffered_array : buff) {
+ for (auto& buffered_array : buff)
arrays.push_back(&buffered_array);
- }
+
if (create_graph) {
states.resize(num_forward_nodes);
- nnvm::DFSVisit(sym.outputs, [&](const nnvm::NodePtr& n) {
- AGInfo& info = AGInfo::Get(n);
- states[idx.node_id(n.get())] = info.state;
- for (uint32_t i = 0; i < info.outputs.size(); ++i) {
- CHECK(idx.exist(n.get()));
- size_t nid = idx.node_id(n.get());
- size_t eid = idx.entry_id(nid, i);
+ nnvm::DFSVisit(forward_outputs, [&](const nnvm::NodePtr& n) {
+ const AGInfo& info = AGInfo::Get(n);
+ states.at(indexed_graph.node_id(n.get())) = info.state;
+ for (size_t i = 0; i < info.outputs.size(); ++i) {
+ CHECK(indexed_graph.exist(n.get()));
+ const size_t nid = indexed_graph.node_id(n.get());
+ const size_t eid = indexed_graph.entry_id(nid, i);
buff[eid] = info.outputs[i];
- buff[eid].entry_ = NodeEntry{n, i, 0};
+ buff[eid].autograd_ = NodeEntry{n, static_cast<uint32_t>(i), 0};
ref_count[eid] = 1;
}
});
for (auto& ograd_entry : ograd_entries) {
- AGInfo& info = AGInfo::Get(ograd_entry.node);
- if (!idx.exist(ograd_entry.node.get())) continue;
- size_t eid = idx.entry_id(ograd_entry);
+ const AGInfo& info = AGInfo::Get(ograd_entry.node);
+ if (!indexed_graph.exist(ograd_entry.node.get())) continue;
+ size_t eid = indexed_graph.entry_id(ograd_entry);
buff[eid] = info.outputs[0];
- buff[eid].entry_ = ograd_entry;
+ buff[eid].autograd_ = ograd_entry;
}
} else {
states.reserve(num_forward_nodes);
for (size_t i = 0; i < num_forward_nodes; ++i) {
- const AGInfo& info = dmlc::get<AGInfo>(idx[i].source->info);
+ // TODO(larroy): This is a code smell 💩
+      AGInfo& info = const_cast<AGInfo&>(dmlc::get<AGInfo>(indexed_graph[i].source->info));
states.emplace_back(info.state);
for (size_t j = 0; j < info.outputs.size(); ++j) {
- size_t eid = idx.entry_id(i, j);
- arrays[eid] = const_cast<NDArray*>(&(info.outputs[j]));
-
- if (retain_graph || info.grad_req != kNullOp) ref_count[eid] = 1;
+ const size_t eid = indexed_graph.entry_id(i, j);
+ arrays[eid] = &(info.outputs[j]);
+ if (retain_graph || info.grad_req != kNullOp)
+ ref_count[eid] = 1;
}
}
for (auto& ograd_entry : ograd_entries) {
- if (!idx.exist(ograd_entry.node.get())) continue;
+ if (!indexed_graph.exist(ograd_entry.node.get())) continue;
AGInfo& info = AGInfo::Get(ograd_entry.node);
- arrays[idx.entry_id(ograd_entry)] = &info.outputs[0];
+ arrays[indexed_graph.entry_id(ograd_entry)] = &info.outputs[0];
}
}
for (size_t i = num_forward_outputs; i < graph.outputs.size(); ++i) {
- size_t eid = idx.entry_id(graph.outputs[i]);
- arrays[eid] = x_grads[i - num_forward_outputs];
+ size_t eid = indexed_graph.entry_id(graph.outputs[i]);
+ arrays[eid] = gvars.gradients[i - num_forward_outputs];
ref_count[eid] = 1;
}
// Assign context
- auto vctx = PlaceDevice(idx);
+ auto vctx = PlaceDevice(indexed_graph);
// Infer shape type
{
Review comment:
We could. Do you think we could leave that to a follow-up refactor? This is an incremental process, and this code might need to be revisited for thread safety soon.
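
As a purely illustrative aside (not part of the PR), "addressed for thread safety" usually means guarding shared mutable bookkeeping against concurrent access. The sketch below shows one common C++ pattern for that; the names AutogradBook, Record, and Snapshot are hypothetical and do not reflect MXNet's actual design.

    #include <mutex>
    #include <vector>

    // Hypothetical sketch: shared autograd bookkeeping guarded by a mutex.
    // Names are illustrative only; this is not MXNet code.
    class AutogradBook {
     public:
      void Record(int node_id) {
        std::lock_guard<std::mutex> lock(mu_);  // serialize concurrent writers
        recorded_nodes_.push_back(node_id);
      }
      std::vector<int> Snapshot() const {
        std::lock_guard<std::mutex> lock(mu_);  // take a consistent copy for readers
        return recorded_nodes_;
      }
     private:
      mutable std::mutex mu_;
      std::vector<int> recorded_nodes_;
    };

An alternative is keeping such state thread_local, which avoids locking at the cost of per-thread copies.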