This is an automated email from the ASF dual-hosted git repository.
andrewzhaoluo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 90084ab8db [PROFILER] Theoretical roofline models (#11066)
90084ab8db is described below
commit 90084ab8db4a4afdb305a7cfbbc1aca17a2df811
Author: Tristan Konolige <[email protected]>
AuthorDate: Tue May 3 14:54:38 2022 -0700
[PROFILER] Theoretical roofline models (#11066)
`tvm.analysis.roofline_analysis` adds estimated roofline performance to a
profiling report. The roofline model measures how close an operator gets
to best possible memory bandwidth or FLOP/s depending on whether it is
memory or compute bound. This computation uses the runtime of the
operator along with two numbers extracted from the TIR code: bytes of
memory touched and number of floating point operations. Because these
numbers are extracted from TIR, they may not be 100% accurate. The best
possible memory bandwidth and FLOP/s are measured by running small
programs that are memory and compute bound respectively.
For now, this function only works with llvm cpu targets, but it should
be possible to extend to GPU targets.
---
include/tvm/runtime/profiling.h | 15 ++
python/tvm/runtime/profiling/__init__.py | 47 ++++
python/tvm/utils/__init__.py | 19 ++
python/tvm/utils/roofline.py | 315 ++++++++++++++++++++++++
src/node/structural_hash.cc | 13 +
src/runtime/profiling.cc | 133 ++++++++--
tests/python/unittest/test_runtime_profiling.py | 47 ++++
7 files changed, 563 insertions(+), 26 deletions(-)
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 3cfb73f58e..0163f0c2e4 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -459,6 +459,21 @@ class CountNode : public Object {
TVM_DECLARE_FINAL_OBJECT_INFO(CountNode, Object);
};
+/* \brief A ratio of two things. */
+class RatioNode : public Object {
+ public:
+ /* The ratio as a double precision floating point number. */
+ double ratio;
+
+ /* \brief Construct a new ratio.
+ * \param a The ratio.
+ */
+ explicit RatioNode(double a) : ratio(a) {}
+
+ static constexpr const char* _type_key = "runtime.profiling.Ratio";
+ TVM_DECLARE_FINAL_OBJECT_INFO(RatioNode, Object);
+};
+
/*! \brief String representation of an array of NDArray shapes
* \param shapes Array of NDArrays to get the shapes of.
* \return A textual representation of the shapes. For example: `float32[2],
int64[1, 2]`.
diff --git a/python/tvm/runtime/profiling/__init__.py
b/python/tvm/runtime/profiling/__init__.py
index a79c46f4a8..5737790378 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -35,6 +35,21 @@ class Report(Object):
Per-device metrics collected over the entire run.
"""
+ def __init__(
+ self, calls: Sequence[Dict[str, Object]], device_metrics: Dict[str,
Dict[str, Object]]
+ ):
+ """Construct a profiling report from a list of metrics and per-device
metrics.
+
+ Parameters
+ ----------
+ calls : Sequence[Dict[str, Object]]
+ Per function call metrics.
+
+ device_metrics : Dict[str, Dict[str, Object]]
+ Per device metrics.
+ """
+ self.__init_handle_by_constructor__(_ffi_api.Report, calls,
device_metrics)
+
def csv(self):
"""Convert this profiling report into CSV format.
@@ -150,6 +165,38 @@ class Report(Object):
return _ffi_api.FromJSON(s)
+@_ffi.register_object("runtime.profiling.Count")
+class Count(Object):
+    """An integer count of something"""
+
+ def __init__(self, count: int):
+ self.__init_handle_by_constructor__(_ffi_api.Count, count)
+
+
+@_ffi.register_object("runtime.profiling.Duration")
+class Duration(Object):
+ """A duration of something"""
+
+ def __init__(self, duration: float):
+ self.__init_handle_by_constructor__(_ffi_api.Duration, duration)
+
+
+@_ffi.register_object("runtime.profiling.Percent")
+class Percent(Object):
+ """A Percent of something"""
+
+ def __init__(self, percent: float):
+ self.__init_handle_by_constructor__(_ffi_api.Percent, percent)
+
+
+@_ffi.register_object("runtime.profiling.Ratio")
+class Ratio(Object):
+ """A Ratio of two things"""
+
+ def __init__(self, ratio: float):
+ self.__init_handle_by_constructor__(_ffi_api.Ratio, ratio)
+
+
@_ffi.register_object("runtime.profiling.MetricCollector")
class MetricCollector(Object):
"""Interface for user defined profiling metric collection."""
diff --git a/python/tvm/utils/__init__.py b/python/tvm/utils/__init__.py
new file mode 100644
index 0000000000..3c1703c244
--- /dev/null
+++ b/python/tvm/utils/__init__.py
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Utilities operating at a graph/model or other "high" level"""
+
+from .roofline import estimate_peak_bandwidth, estimate_peak_fma_flops,
roofline_analysis
diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py
new file mode 100644
index 0000000000..2d05503da7
--- /dev/null
+++ b/python/tvm/utils/roofline.py
@@ -0,0 +1,315 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Utilities for computing an approximate roofline model"""
+from typing import Dict, Union, Optional
+import numpy as np
+
+from .. import auto_scheduler, relay, tir, nd, IRModule, build, topi, transform
+from ..target import Target
+from ..runtime import profiler_vm, profiling, Device, num_threads
+from ..script import tir as T
+
+
+def _create_args(mod: IRModule, dev: Device, func_name: str = "main"):
+ args = []
+ for arg in mod[func_name].params:
+ args.append(
+ nd.array(
+ np.zeros([x.value for x in arg.type_annotation.shape],
arg.type_annotation.dtype),
+ device=dev,
+ )
+ )
+ return args
+
+
+def _estimated_features(mod: IRModule, params: Dict[str, nd.NDArray], target:
Target):
+ comp = relay.vm.VMCompiler()
+ mod, params = comp.optimize(mod, params=params, target=target)
+ return {
+ prim.attrs["hash"]: (name,
auto_scheduler.feature.named_features_from_primfunc(prim))
+ for name, prim in mod.functions.items()
+ if isinstance(prim, tir.PrimFunc)
+ }
+
+
+def _detect_vec_width_registers(
+ target: Target, vec_width: Optional[int], num_vector_registers:
Optional[int]
+):
+ """Get the vector width and number of vector registers for a target.
+
+ Parameters
+ ----------
+ target : Target
+ Target to detect vector width and registers for.
+ vec_width : Optional[int]
+ If None, try and detect vector width from target. Otherwise provided
input is used.
+ num_vector_registers : Optional[int]
+        If None, try and detect the number of vector registers from target. Otherwise
provided input is used.
+
+ Returns
+ -------
+ vec_width: int
+ Width of a vector register on `target`.
+ num_vector_registers: int
+ Number of vector registers on `target`.
+ """
+ if vec_width is None:
+ # Only implemented for x86 so far...
+ if (
+ str(target.kind) == "llvm"
+ and target.device_name == ""
+ and len(target.keys) == 1
+ and target.keys[0] == "cpu"
+ ):
+ with target:
+ vec_width = topi.x86.utils.get_simd_32bit_lanes() # in number
of float32s
+ else:
+ raise RuntimeError(f"Cannot determine vector width for target
{target}")
+ if num_vector_registers is None:
+ if target.device_name == "": # indicates x86
+ num_vector_registers = 16 # Assuming for all platforms, probably
wrong on older ones
+ else:
+ raise RuntimeError(f"Cannot determine number of vector registers
for target {target}")
+ return vec_width, num_vector_registers
+
+
[email protected]_func
+def peakflops_fma_tir(
+ a: T.handle,
+ vec_width: T.int32,
+ iters: T.int32,
+ num_vector_registers: T.int32,
+ threads: T.int32,
+) -> None:
+ # pylint: disable=invalid-name, missing-function-docstring
+ A = T.match_buffer(a, [threads, num_vector_registers, vec_width],
"float32")
+ for t in T.parallel(threads):
+ for _j in range(iters):
+ for l in T.unroll(num_vector_registers):
+ # We want to use as few registers as possible, so we perform
+ # all operations on the same element
+ for k in T.vectorized(vec_width):
+ A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k]
+
+
+def estimate_peak_fma_flops(
+ target: Target,
+ dev: Device,
+ vec_width: Optional[int] = None,
+ num_vector_registers: Optional[int] = None,
+) -> float:
+ """
+ Estimate the maximum number of FLOP/s this target/device combo is capable
+ of reaching by running a test program. This assumes vectorized f32 FMA
+ (fused-multiply-add) instructions.
+
+
+ Parameters
+ ----------
+ target : Target
+ Target to run on. This should be as specific to the actual hardware as
+ possible to make sure that LLVM generates the best vector code.
+ dev : Device
+ Device to run on.
+ vec_width : Optional[int]
+ Vector width of SIMD units on the underlying hardware. Will try to
+ infer if no value is provided.
+ num_vector_registers : Optional[int]
+ Number of vector registers on the underlying hardware. Will try to
+ infer if no value is provided.
+
+ Returns
+ -------
+ float
+ Approximate sustained FLOP/s of this target/device combo assuming
+ vectorized f32 FMA instructions.
+ """
+ assert str(target.kind) == "llvm", "Only llvm targets are supported"
+ vec_width, num_vector_registers = _detect_vec_width_registers(
+ target, vec_width, num_vector_registers
+ )
+ iters = 1000000
+ nthreads = num_threads()
+ specialized = peakflops_fma_tir.specialize(
+ {
+ peakflops_fma_tir.params[1]: vec_width,
+ peakflops_fma_tir.params[2]: iters,
+ peakflops_fma_tir.params[3]: num_vector_registers,
+ peakflops_fma_tir.params[4]: nthreads,
+ }
+ )
+ with transform.PassContext(opt_level=3):
+ f = build(specialized, target=target)
+ a = nd.array(np.ones((nthreads, num_vector_registers, vec_width),
dtype="float32"), device=dev)
+ times = f.time_evaluator(f.entry_name, dev, repeat=100, number=1)(a)
+ flops = 2 * vec_width * num_vector_registers * nthreads * iters # fma is
two flops
+ flop_s = flops / times.min
+ return flop_s
+
+
[email protected]_func
+def peak_bandwidth_tir(a: T.handle, b: T.handle, threads: T.int32, vec_width:
T.int32) -> None:
+ # pylint: disable=invalid-name, missing-function-docstring
+ N = T.var("int32")
+ A = T.match_buffer(a, [threads, N, 4, vec_width], "float32")
+ B = T.match_buffer(b, [threads, vec_width, 4], "float32")
+ # Parallelism is necessary to hit all cores/nodes
+ for i in T.parallel(threads):
+ for k in T.serial(N):
+ for l in T.unroll(4):
+ # vectorized load is necessary to hit peak bandwidth
+ for j in T.vectorized(vec_width):
+ # += is necessary to introduce a data dependency for all
+ # elements of A, preventing the backend from removing the
+ # `k` loop and setting `k` to the loop extent.
+ B[i, l, j] += A[i, k, l, j]
+
+
+def estimate_peak_bandwidth(target: Target, dev: Device, vec_width:
Optional[int] = None) -> float:
+ """Estimate peak memory bandwidth of a target/device combo.
+
+ Peak bandwidth is estimated by running a small experiment on the underlying
+ hardware. The peak bandwidth measurement assumes that vector instructions
+ are being used to load the data.
+
+ Parameters
+ ----------
+ target : Target
+ Target to use for measurement. This target should be as specific to the
+ underlying hardware as possible.
+ dev : Device
+ Device to measure peak bandwidth on.
+ vec_width : Optional[int]
+ Vector unit width, determined from target if not supplied.
+
+ Returns
+ -------
+ float
+ Peak memory bandwidth in bytes/seconds.
+ """
+ # Ideally we'd be able to use this code to measure peak bandwidth of the
+ # different cache levels. If we could just generate load commands, then we
+ # could use those in a tight loop. Instead we need some code that is
+ # limited on the cache bandwidth. With the L1 cache we need an operation
+ # that has a very low arithmetic intensity and we haven't come up with one
+ # yet.
+ vec_width, _ = _detect_vec_width_registers(target, vec_width, 1)
+ specialized = peak_bandwidth_tir.specialize(
+ {
+ peak_bandwidth_tir.params[3]: vec_width,
+ }
+ )
+ with transform.PassContext(opt_level=3):
+ f = build(specialized, target=target)
+ threads = num_threads()
+ # Data size needs to be larger than last level of cache. We don't have a
+ # way of getting cache sizes, so this number should give us a large enough
+ # size.
+ size = 10**8 // (4 * threads * vec_width)
+ a = nd.array(np.ones((threads, size, 4, vec_width), dtype="float32"),
device=dev)
+ b = nd.array(np.ones((threads, vec_width, 4), dtype="float32"), device=dev)
+ times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b,
threads)
+ return a.numpy().size * 4 / times.min # 4 bytes per float32
+
+
+def roofline_analysis(
+ mod: IRModule, params: Dict[str, nd.NDArray], target: Union[str, Target],
dev: Device
+) -> profiling.Report:
+ """
+ Create a profiling report that contains roofline and other estimated
+ statistics from running a module on the VM.
+
+ These statistics are calculated by analyzing the lowered TIR of each
+ operator, so they are estimates of the true values. The statistics are:
+ - Bound: Is the operator memory or compute bound. This is computed by
+ assuming that the operator could perfectly cache all loads -- each byte
+ of memory is only loaded once.
+ - Percent of Theoretical Optimal: What percent of theoretical optimal for
+ the bound. i.e. percent of peak memory bandwidth if memory bound,
+ percent of peak FLOP/s if compute bound.
+ - Loaded Bytes: estimation of the number of bytes loaded from main
memory.
+ - Estimated Flops: estimated number of floating point operations.
+ - Arithmetic Intensity: ratio of FLOPs per byte of data.
+ - FLOP/s: floating point operations per second.
+ - Bandwidth: Number of bytes loaded per second.
+
+ Parameters
+ ----------
+ mod : IRModule
+        Uncompiled input module.
+
+ params : Dict[str, nd.NDArray]
+
+ target : Union[str, Target]
+ Target to run on.
+
+ dev : Device
+ Device to run on.
+
+ Returns
+ -------
+
+ report : profiling.Report
+ Profiling report which includes the estimated statistics.
+ """
+ if isinstance(target, str):
+ target = Target(target)
+ peak_bandwidth = estimate_peak_bandwidth(target, dev)
+ peak_flops = estimate_peak_fma_flops(target, dev)
+
+ ridge_point = peak_flops / peak_bandwidth
+
+ all_features = _estimated_features(mod, params, target)
+
+ lib = relay.vm.compile(mod, params=params, target=target)
+ vmexec = profiler_vm.VirtualMachineProfiler(lib, dev)
+
+ args = _create_args(mod, dev)
+ report = vmexec.profile(*args)
+ new_calls = []
+ for call in report.calls:
+ if "Hash" in call.keys():
+ _, features = all_features[call["Hash"]]
+
+ flops = np.sum(features["float_addsub"] + features["float_mul"] +
features["float_mad"])
+ loaded_bytes = 0.0
+ # assume no more than 100 buffers
+ for i in range(100):
+ key = f"B{i}.bytes"
+ if not key in features.keys():
+ break
+ loaded_bytes += np.sum(features[key])
+ runtime = call["Duration (us)"].microseconds * 1e-6
+ arith_inten = flops / loaded_bytes
+ call = dict(call)
+ call["Loaded Bytes"] = profiling.Count(int(loaded_bytes))
+ call["Estimated FLOPs"] = profiling.Count(int(flops))
+ call["Arithmetic Intensity"] = profiling.Ratio(arith_inten)
+ call["FLOP/s"] = profiling.Ratio(flops / runtime)
+ call["Bandwidth"] = profiling.Ratio(loaded_bytes / runtime)
+ compute_bound = arith_inten > ridge_point
+ call["Bound"] = "compute" if compute_bound else "memory"
+ per_mem_bound = (loaded_bytes / runtime) / peak_bandwidth * 100
+ per_compute_bound = flops / peak_flops * 100.0
+ # We use ratio here because the percentages should be averaged
instead of summed.
+ call["Percent of Theoretical Optimal"] = profiling.Ratio(
+ per_compute_bound if compute_bound else per_mem_bound
+ )
+ new_calls.append(call)
+ else:
+ new_calls.append(call)
+ return profiling.Report(new_calls, report.device_metrics)
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index 05899e4465..4d82f1e38b 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -553,5 +553,18 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
auto* op = static_cast<const
runtime::profiling::PercentNode*>(node.get());
p->stream << op->GetTypeKey() << "(" << op->percent << ")";
});
+struct RatioNodeTrait {
+ static void VisitAttrs(runtime::profiling::RatioNode* n, AttrVisitor* attrs)
{
+ attrs->Visit("ratio", &n->ratio);
+ }
+ static constexpr std::nullptr_t SEqualReduce = nullptr;
+ static constexpr std::nullptr_t SHashReduce = nullptr;
+};
+TVM_REGISTER_REFLECTION_VTABLE(runtime::profiling::RatioNode, RatioNodeTrait);
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+ .set_dispatch<runtime::profiling::RatioNode>([](const ObjectRef& node,
ReprPrinter* p) {
+ auto* op = static_cast<const runtime::profiling::RatioNode*>(node.get());
+ p->stream << op->GetTypeKey() << "(" << op->ratio << ")";
+ });
} // namespace tvm
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 6d95a0fbd2..9499a6e7a5 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -263,6 +263,8 @@ String ReportNode::AsCSV() const {
s << (*it).second.as<DurationNode>()->microseconds;
} else if ((*it).second.as<PercentNode>()) {
s << (*it).second.as<PercentNode>()->percent;
+ } else if ((*it).second.as<RatioNode>()) {
+ s << (*it).second.as<RatioNode>()->ratio;
} else if ((*it).second.as<StringObj>()) {
s << "\"" << Downcast<String>((*it).second) << "\"";
}
@@ -285,9 +287,14 @@ void print_metric(std::ostream& os, ObjectRef o) {
} else if (const CountNode* n = o.as<CountNode>()) {
os << "{\"count\":" << n->value << "}";
} else if (const DurationNode* n = o.as<DurationNode>()) {
- os << "{\"microseconds\":" << std::setprecision(17) << std::fixed <<
n->microseconds << "}";
+ os << "{\"microseconds\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
+ << std::fixed << n->microseconds << "}";
} else if (const PercentNode* n = o.as<PercentNode>()) {
- os << "{\"percent\":" << std::setprecision(17) << std::fixed << n->percent
<< "}";
+ os << "{\"percent\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
+ << std::fixed << n->percent << "}";
+ } else if (const RatioNode* n = o.as<RatioNode>()) {
+ os << "{\"ratio\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
+ << std::fixed << n->ratio << "}";
} else {
LOG(FATAL) << "Unprintable type " << o->GetTypeKey();
}
@@ -343,6 +350,51 @@ String ReportNode::AsJSON() const {
return s.str();
}
+// Aggregate a set of values for a metric. Computes sum for Duration, Count,
+// and Percent; average for Ratio; and assumes all Strings are the same. All
+// ObjectRefs in metrics must have the same type.
+ObjectRef AggregateMetric(const std::vector<ObjectRef>& metrics) {
+ ICHECK_GT(metrics.size(), 0) << "Must pass a non-zero number of metrics";
+ if (metrics[0].as<DurationNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<DurationNode>()->microseconds;
+ }
+ return ObjectRef(make_object<DurationNode>(sum));
+ } else if (metrics[0].as<CountNode>()) {
+ int64_t sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<CountNode>()->value;
+ }
+ return ObjectRef(make_object<CountNode>(sum));
+ } else if (metrics[0].as<PercentNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<PercentNode>()->percent;
+ }
+ return ObjectRef(make_object<PercentNode>(sum));
+ } else if (metrics[0].as<RatioNode>()) {
+ double sum = 0;
+ for (auto& metric : metrics) {
+ sum += metric.as<RatioNode>()->ratio;
+ }
+ return ObjectRef(make_object<RatioNode>(sum / metrics.size()));
+ } else if (metrics[0].as<StringObj>()) {
+ for (auto& m : metrics) {
+ if (Downcast<String>(metrics[0]) != Downcast<String>(m)) {
+ return ObjectRef(String(""));
+ }
+ }
+ // Assume all strings in metrics are the same.
+ return metrics[0];
+ } else {
+ LOG(FATAL) << "Can only aggregate metrics with types DurationNode,
CountNode, "
+ "PercentNode, RatioNode, and StringObj, but got "
+ << metrics[0]->GetTypeKey();
+ return ObjectRef(); // To silence warnings
+ }
+}
+
String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums)
const {
// aggregate calls by op hash (or op name if hash is not set) + argument
shapes
std::vector<Map<String, ObjectRef>> aggregated_calls;
@@ -370,32 +422,27 @@ String ReportNode::AsTable(bool sort, bool aggregate,
bool compute_col_sums) con
}
for (const auto& p : aggregates) {
std::unordered_map<String, ObjectRef> aggregated;
- for (auto i : p.second) {
- for (auto& metric : calls[i]) {
- auto it = aggregated.find(metric.first);
- if (it == aggregated.end()) {
- aggregated[metric.first] = metric.second;
- } else {
- if (metric.second.as<DurationNode>()) {
- aggregated[metric.first] = ObjectRef(
-
make_object<DurationNode>(it->second.as<DurationNode>()->microseconds +
-
metric.second.as<DurationNode>()->microseconds));
- } else if (metric.second.as<CountNode>()) {
- aggregated[metric.first] = ObjectRef(make_object<CountNode>(
- it->second.as<CountNode>()->value +
metric.second.as<CountNode>()->value));
- } else if (metric.second.as<PercentNode>()) {
- aggregated[metric.first] =
-
ObjectRef(make_object<PercentNode>(it->second.as<PercentNode>()->percent +
-
metric.second.as<PercentNode>()->percent));
- } else if (metric.second.as<StringObj>()) {
- // Don't do anything. Assume the two strings are the same.
- } else {
- LOG(FATAL) << "Can only aggregate metrics with types
DurationNode, CountNode, "
- "PercentNode, and StringObj, but got "
- << metric.second->GetTypeKey();
- }
+ std::unordered_set<std::string> metrics;
+ for (auto& call : calls) {
+ for (auto& metric : call) {
+ metrics.insert(metric.first);
+ }
+ }
+ for (const std::string& metric : metrics) {
+ std::vector<ObjectRef> per_call;
+ for (auto i : p.second) {
+ auto& call = calls[i];
+ auto it = std::find_if(call.begin(), call.end(),
+ [&metric](const std::pair<String, ObjectRef>&
call_metric) {
+ return std::string(call_metric.first) ==
metric;
+ });
+ if (it != call.end()) {
+ per_call.push_back((*it).second);
}
}
+ if (per_call.size() > 0) {
+ aggregated[metric] = AggregateMetric(per_call);
+ }
}
aggregated_calls.push_back(aggregated);
}
@@ -440,6 +487,8 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool
compute_col_sums) con
val += it->second.as<PercentNode>()->percent;
}
col_sums[p.first] = ObjectRef(make_object<PercentNode>(val));
+ } else if (p.second.as<RatioNode>()) {
+ // It does not make sense to sum ratios
}
}
}
@@ -499,6 +548,11 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool
compute_col_sums) con
std::stringstream s;
s << std::fixed << std::setprecision(2) <<
(*it).second.as<PercentNode>()->percent;
val = s.str();
+ } else if ((*it).second.as<RatioNode>()) {
+ std::stringstream s;
+          s.imbue(std::locale(""));  // for 1000s separators
+ s << std::setprecision(2) << (*it).second.as<RatioNode>()->ratio;
+ val = s.str();
} else if ((*it).second.as<StringObj>()) {
val = Downcast<String>((*it).second);
}
@@ -615,6 +669,10 @@ Map<String, ObjectRef> parse_metrics(dmlc::JSONReader*
reader) {
int64_t count;
reader->Read(&count);
o = ObjectRef(make_object<CountNode>(count));
+ } else if (metric_value_name == "ratio") {
+ double ratio;
+ reader->Read(&ratio);
+ o = ObjectRef(make_object<RatioNode>(ratio));
} else if (metric_value_name == "string") {
std::string s;
reader->Read(&s);
@@ -664,6 +722,7 @@ Report Report::FromJSON(String json) {
TVM_REGISTER_OBJECT_TYPE(DurationNode);
TVM_REGISTER_OBJECT_TYPE(PercentNode);
TVM_REGISTER_OBJECT_TYPE(CountNode);
+TVM_REGISTER_OBJECT_TYPE(RatioNode);
TVM_REGISTER_OBJECT_TYPE(ReportNode);
TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode);
TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode);
@@ -794,6 +853,28 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev,
int number, int repeat,
return PackedFunc(ftimer);
}
+TVM_REGISTER_GLOBAL("runtime.profiling.Report")
+ .set_body_typed([](Array<Map<String, ObjectRef>> calls,
+ Map<String, Map<String, ObjectRef>> device_metrics) {
+ return Report(calls, device_metrics);
+ });
+
+TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t
count) {
+ return ObjectRef(make_object<CountNode>(count));
+});
+
+TVM_REGISTER_GLOBAL("runtime.profiling.Percent").set_body_typed([](double
percent) {
+ return ObjectRef(make_object<PercentNode>(percent));
+});
+
+TVM_REGISTER_GLOBAL("runtime.profiling.Duration").set_body_typed([](double
duration) {
+ return ObjectRef(make_object<DurationNode>(duration));
+});
+
+TVM_REGISTER_GLOBAL("runtime.profiling.Ratio").set_body_typed([](double ratio)
{
+ return ObjectRef(make_object<RatioNode>(ratio));
+});
+
} // namespace profiling
} // namespace runtime
} // namespace tvm
diff --git a/tests/python/unittest/test_runtime_profiling.py
b/tests/python/unittest/test_runtime_profiling.py
index 2f8366f470..b2928cfe1d 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -20,8 +20,10 @@ from io import StringIO
import csv
import os
import json
+import platform
import tvm.testing
+import tvm.utils
from tvm.runtime import profiler_vm
from tvm import relay
from tvm.relay.testing import mlp
@@ -257,6 +259,51 @@ def test_profile_function(target, dev):
assert report[metric].value > 0
[email protected]_targets("llvm")
+def test_estimate_peak_fma_flops(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
+ flops = tvm.utils.estimate_peak_fma_flops(tvm.target.Target(target), dev)
+ # Assume we can achieve 1 GFLOP/s per thread, which is 1 FLOP per cycle on
a 1GHz cpu.
+ assert (
+ flops > 10**9 * tvm.runtime.num_threads() and flops < 10**14
+ ), f"FLOP/s should be between 10^9 * num_threads and 10^14, but it is
{flops}"
+
+
[email protected]_targets("llvm")
+def test_estimate_peak_bandwidth(target, dev):
+ # This test uses vectorized instructions so we need a target that supports
them
+ if target == "llvm":
+ target = "llvm -mattr=+fma,+avx2"
+ bandwidth = tvm.utils.estimate_peak_bandwidth(tvm.target.Target(target),
dev)
+ # Assume we can achieve 1 GB/s. DDR2 should transfer somewhere around 6
+ # GB/s, so this should leave enough wiggle room.
+ assert (
+ bandwidth > 10**9 and bandwidth < 10**12
+ ), f"Bandwidth should be between 10^9 and 10^12, but it is {bandwidth}"
+
+
[email protected](platform.machine() == "i386", reason="Cannot allocate
enough memory on i386")
[email protected]_targets("llvm")
+def test_roofline_analysis(target, dev):
+ a = relay.var("a", relay.TensorType((512, 512), "float32"))
+ b = relay.var("b", relay.TensorType((512, 512), "float32"))
+ c = relay.nn.dense(a, b)
+ mod = tvm.IRModule.from_expr(relay.Function([a, b], c))
+ params = {}
+ report = tvm.utils.roofline_analysis(mod, params, target, dev)
+
+ assert "Bound" in report.table()
+ assert "Percent of Theoretical Optimal" in report.table()
+ for call in report.calls:
+ if "Percent of Theoretical Optimal" in call:
+ # Ideally we'd like a little tighter bound here, but it is hard to
+ # know how well this dense will perform without tuning. And we
+ # don't have an operator that uses a specific number of flops.
+ assert call["Percent of Theoretical Optimal"].ratio >= 0
+
+
if __name__ == "__main__":
import sys
import pytest