AndrewZhaoLuo commented on code in PR #11066:
URL: https://github.com/apache/tvm/pull/11066#discussion_r858087970
##########
tests/python/unittest/test_runtime_profiling.py:
##########
@@ -257,6 +259,50 @@ def test_profile_function(target, dev):
assert report[metric].value > 0
+@tvm.testing.parametrize_targets("llvm")
+def test_estimate_peak_fma_flops(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
+ flops = tvm.utils.estimate_peak_fma_flops(tvm.target.Target(target), dev)
+ # assume we can achieve 1 GFLOP/s per thread
Review Comment:
How is this figure derived? Is it 1 FLOP per cycle on a 1 GHz processor?
It would be nice to clarify that here.
##########
src/runtime/profiling.cc:
##########
@@ -288,6 +290,8 @@ void print_metric(std::ostream& os, ObjectRef o) {
os << "{\"microseconds\":" << std::setprecision(17) << std::fixed <<
n->microseconds << "}";
} else if (const PercentNode* n = o.as<PercentNode>()) {
os << "{\"percent\":" << std::setprecision(17) << std::fixed << n->percent
<< "}";
+ } else if (const RatioNode* n = o.as<RatioNode>()) {
+ os << "{\"ratio\":" << std::setprecision(17) << std::fixed << n->ratio <<
"}";
Review Comment:
Nit: I suggest using `std::numeric_limits<double>::max_digits10` instead of the
hard-coded `17`, both here and in the usages above.
##########
src/runtime/profiling.cc:
##########
@@ -343,6 +347,46 @@ String ReportNode::AsJSON() const {
return s.str();
}
+// Aggregate a set of values for a metric. Computes sum for Duration, Count,
+// and Percent; average for Ratio; and assumes all Strings are the same. All
+// ObjectRefs in metrics must have the same type.
+ObjectRef AggregateMetric(const std::vector<ObjectRef>& metrics) {
+ ICHECK_GT(metrics.size(), 0) << "Must pass a non-zero number of metrics";
+ if (metrics[0].as<DurationNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<DurationNode>()->microseconds;
+ }
+ return ObjectRef(make_object<DurationNode>(sum));
+ } else if (metrics[0].as<CountNode>()) {
+ int64_t sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<CountNode>()->value;
+ }
+ return ObjectRef(make_object<CountNode>(sum));
+ } else if (metrics[0].as<PercentNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<PercentNode>()->percent;
+ }
+ return ObjectRef(make_object<PercentNode>(sum));
+ } else if (metrics[0].as<RatioNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<RatioNode>()->ratio;
+ }
+ return ObjectRef(make_object<RatioNode>(sum / metrics.size()));
+ } else if (metrics[0].as<StringObj>()) {
+ // Assume all strings in metrics are the same.
Review Comment:
I know the old code didn't check this, but it might be a good idea to verify
here that the assumption is correct.
##########
tests/python/unittest/test_runtime_profiling.py:
##########
@@ -257,6 +259,50 @@ def test_profile_function(target, dev):
assert report[metric].value > 0
+@tvm.testing.parametrize_targets("llvm")
+def test_estimate_peak_fma_flops(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
+ flops = tvm.utils.estimate_peak_fma_flops(tvm.target.Target(target), dev)
+ # assume we can achieve 1 GFLOP/s per thread
+ assert (
+ flops > 10**9 * tvm.runtime.num_threads() and flops < 10**14
+ ), f"FLOP/s should be between 10^9 * num_threads and 10^14, but it is
{flops}"
+
+
+@tvm.testing.parametrize_targets("llvm")
+def test_estimate_peak_bandwidth(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
+ bandwidth = tvm.utils.estimate_peak_bandwidth(tvm.target.Target(target),
dev)
+ # assume we can achieve 1 GB/s
Review Comment:
It would also be nice to see where the bandwidth assumption comes from.
##########
tests/python/unittest/test_runtime_profiling.py:
##########
@@ -257,6 +259,50 @@ def test_profile_function(target, dev):
assert report[metric].value > 0
+@tvm.testing.parametrize_targets("llvm")
+def test_estimate_peak_fma_flops(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
Review Comment:
Is it possible to turn off vectorization and check, as a test, whether that
affects the measured FLOP/s?
##########
tests/python/unittest/test_runtime_profiling.py:
##########
@@ -257,6 +259,50 @@ def test_profile_function(target, dev):
assert report[metric].value > 0
+@tvm.testing.parametrize_targets("llvm")
+def test_estimate_peak_fma_flops(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
Review Comment:
This test will also require a processor that supports these extensions, right?
So I can't run it on my M1 Mac. Can we skip it when the host does not support AVX2?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]