AndrewZhaoLuo commented on code in PR #11000:
URL: https://github.com/apache/tvm/pull/11000#discussion_r852460978
##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1,
min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
+ def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+ """Benchmark a single node in the serialized graph.
Review Comment:
Done
##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1,
min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
+ def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+ """Benchmark a single node in the serialized graph.
+
+ Parameters
+ ----------
+ index : int
+ The index of the node, see `self.debug_datum.get_graph_nodes`
+
+ number: int
+ The number of times to run the node to get a benchmark result.
+
+ repeat: int
+ The number of times to benchmark the nodes.
+
+ min_repeat_ms: int
+ The minimum consecutive runtime of the node for a benchmark result.
+
+ Returns
+ -------
+ A list of dimensions `number` x `repeat` each one the runtime of the
node
Review Comment:
Basically, if you have 3 repeats of 3 numbers, it would return a 3x3
array/list.
arr[0][1] would be the first repeat, second number; arr[1][2] would be
repeat 2, number 3, etc.
I think BenchmarkResult is better though since it seems to store the
sequence of all float results anyway?
##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1,
min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
+ def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
Review Comment:
done
##########
python/tvm/contrib/debugger/debug_executor.py:
##########
@@ -281,6 +282,42 @@ def run_individual(self, number, repeat=1,
min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
+ def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+ """Benchmark a single node in the serialized graph.
+
+ Parameters
+ ----------
+ index : int
+ The index of the node, see `self.debug_datum.get_graph_nodes`
+
+ number: int
+ The number of times to run the node to get a benchmark result.
+
+ repeat: int
+ The number of times to benchmark the nodes.
Review Comment:
I'll probably just use time_evaluator, so will change later.
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const
std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
+ } else if (name == "run_individual_node") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+ int node_index = args[0];
+ int number = args[1];
+ int repeat = args[2];
+ int min_repeat_ms = args[3];
+ ICHECK_GE(node_index, 0);
+ ICHECK_LT(node_index, nodes_.size());
+ ICHECK_GT(number, 0);
+ ICHECK_GT(repeat, 0);
+ ICHECK_GE(min_repeat_ms, 0);
+ std::vector<std::vector<double>> results =
+ this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+ std::stringstream s;
+ s.precision(6); // down to microseconds
Review Comment:
Done
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
- os << time_sec_per_op[index] << ",";
+ double time = time_sec_per_op[index];
+ // To have good behavior when calculating total time, etc.
+ if (isnan(time)) {
+ time = 0;
+ }
Review Comment:
0 / 0 is possible from the above for nodes which do not have any associated
execution function
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
- os << time_sec_per_op[index] << ",";
+ double time = time_sec_per_op[index];
+ // To have good behavior when calculating total time, etc.
+ if (isnan(time)) {
+ time = 0;
+ }
+ os << time << ",";
}
return os.str();
}
+ std::vector<std::vector<double>> RunIndividualNode(int node_index, int
number, int repeat,
+ int min_repeat_ms) {
+ // warmup run
+ // GraphExecutor::Run();
+ std::string tkey = module_->type_key();
+
+ // results_in_seconds[a][b] is the bth index run of the ath index repeat
+ std::vector<std::vector<double>> results_in_seconds;
+
+ if (tkey == "rpc") {
+ LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+ }
+
+ for (int i = 0; i < repeat; ++i) {
+ std::vector<Timer> op_timers;
+ double duration_ms = 0.0;
+
+ // Keep timing operations, upping number of repeats until we reach
min_repeat_ms
+ do {
+ op_timers.clear();
+ if (duration_ms > 0.0) {
+ number = static_cast<int>(std::max((min_repeat_ms / (duration_ms /
number) + 1),
+ number * 1.618)); // 1.618 is
chosen by random
+ }
+
+ std::chrono::time_point<std::chrono::high_resolution_clock,
std::chrono::nanoseconds>
+ tbegin, tend;
+ tbegin = std::chrono::high_resolution_clock::now();
Review Comment:
Oh you mean get rid of all of this timing code here and instead use the
python time_evaluator interface?
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
- os << time_sec_per_op[index] << ",";
+ double time = time_sec_per_op[index];
+ // To have good behavior when calculating total time, etc.
+ if (isnan(time)) {
+ time = 0;
+ }
+ os << time << ",";
}
return os.str();
}
+ std::vector<std::vector<double>> RunIndividualNode(int node_index, int
number, int repeat,
+ int min_repeat_ms) {
+ // warmup run
+ // GraphExecutor::Run();
+ std::string tkey = module_->type_key();
+
+ // results_in_seconds[a][b] is the bth index run of the ath index repeat
+ std::vector<std::vector<double>> results_in_seconds;
+
+ if (tkey == "rpc") {
+ LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+ }
+
+ for (int i = 0; i < repeat; ++i) {
+ std::vector<Timer> op_timers;
+ double duration_ms = 0.0;
+
+ // Keep timing operations, upping number of repeats until we reach
min_repeat_ms
+ do {
+ op_timers.clear();
+ if (duration_ms > 0.0) {
+ number = static_cast<int>(std::max((min_repeat_ms / (duration_ms /
number) + 1),
+ number * 1.618)); // 1.618 is
chosen by random
+ }
+
+ std::chrono::time_point<std::chrono::high_resolution_clock,
std::chrono::nanoseconds>
+ tbegin, tend;
+ tbegin = std::chrono::high_resolution_clock::now();
Review Comment:
Example?
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const
std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
+ } else if (name == "run_individual_node") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+ int node_index = args[0];
+ int number = args[1];
+ int repeat = args[2];
+ int min_repeat_ms = args[3];
+ ICHECK_GE(node_index, 0);
+ ICHECK_LT(node_index, nodes_.size());
+ ICHECK_GT(number, 0);
+ ICHECK_GT(repeat, 0);
+ ICHECK_GE(min_repeat_ms, 0);
+ std::vector<std::vector<double>> results =
+ this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+ std::stringstream s;
+ s.precision(6); // down to microseconds
+
+ for (std::vector<double>& row : results) {
+ for (double cur : row) {
+ s << cur << ", ";
+ }
+ s << "\n";
+ }
+
+ // Have problems returning Integers and FloatImm so this is hack
+ *rv = s.str();
Review Comment:
Haha, well, double -> char* has problems if the endianness of the two systems
is different B). Yeah, IDK what the right way is. I was mostly following the
string serialization approach that `run_individual` uses, though I understand
that this is slow over RPC.
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -362,6 +396,33 @@ PackedFunc GraphExecutorDebug::GetFunction(const
std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
+ } else if (name == "run_individual_node") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
Review Comment:
Thanks, done
##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -114,15 +94,69 @@ class GraphExecutorDebug : public GraphExecutor {
std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
- os << time_sec_per_op[index] << ",";
+ double time = time_sec_per_op[index];
+ // To have good behavior when calculating total time, etc.
+ if (isnan(time)) {
+ time = 0;
+ }
+ os << time << ",";
}
return os.str();
}
+ std::vector<std::vector<double>> RunIndividualNode(int node_index, int
number, int repeat,
+ int min_repeat_ms) {
+ // warmup run
+ // GraphExecutor::Run();
+ std::string tkey = module_->type_key();
+
+ // results_in_seconds[a][b] is the bth index run of the ath index repeat
+ std::vector<std::vector<double>> results_in_seconds;
+
+ if (tkey == "rpc") {
+ LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+ }
+
+ for (int i = 0; i < repeat; ++i) {
+ std::vector<Timer> op_timers;
+ double duration_ms = 0.0;
+
+ // Keep timing operations, upping number of repeats until we reach
min_repeat_ms
+ do {
+ op_timers.clear();
+ if (duration_ms > 0.0) {
+ number = static_cast<int>(std::max((min_repeat_ms / (duration_ms /
number) + 1),
+ number * 1.618)); // 1.618 is
chosen by random
+ }
+
+ std::chrono::time_point<std::chrono::high_resolution_clock,
std::chrono::nanoseconds>
+ tbegin, tend;
+ tbegin = std::chrono::high_resolution_clock::now();
Review Comment:
Hmm, in the refactor this needs to return some timing info (it can't be
outside measuring in). Do you have an example of the timers interface (or just
the file to look at? `timers` is kind of hard to grep for).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]