This is an automated email from the ASF dual-hosted git repository.
srk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new def551dfd5 [CLI TOOLS][RTVM] Improve rtvm tool with new options to
measure native performance (#15818)
def551dfd5 is described below
commit def551dfd50bfff4e9d50108dc4e8027b553b8ec
Author: Siva <[email protected]>
AuthorDate: Fri Sep 29 10:30:20 2023 +0530
[CLI TOOLS][RTVM] Improve rtvm tool with new options to measure native
performance (#15818)
* [RTVM] Improve rtvm tool with new options to measure native performance
Few fixes and enhancements that affect model loading times
New options to measure performance.
* * review comments
* * review comments
---
apps/cpp_rtvm/README.md | 22 +++++
apps/cpp_rtvm/main.cc | 199 ++++++++++++++++++++++++++++++++++++++------
apps/cpp_rtvm/tvm_runner.cc | 129 +++++++++++++++++++++-------
apps/cpp_rtvm/tvm_runner.h | 24 +++++-
4 files changed, 316 insertions(+), 58 deletions(-)
diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md
index c60a7b0e12..652d46eb58 100644
--- a/apps/cpp_rtvm/README.md
+++ b/apps/cpp_rtvm/README.md
@@ -122,6 +122,11 @@ Command line usage
--input - Numpy file for the model input (optional and we use random of
not given)
--output - Numpy file name to dump the model output as numpy
--dump-meta - Dump model meta information
+--pre-compiled - The file name of a file where pre-compiled programs should be
stored
+--profile - Profile over all execution
+--dry-run - Profile after given dry runs, default 10
+--run-count - Profile for given runs, default 50
+--zero-copy - Profile with zero copy api
Example
./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
@@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to the
`rtvm` then After method
`Load`, method `UsePreCompiledProgram` is called. This method loads
pre-compiled
programs if the file exists. In opposite case the file will be created and
pre-compiled programs will be saved to this file.
+
+# Performance Profiling Options
+The tool has added few options to measure wall clock performance of the given
model on Target natively.
+--profile : Can turn on the profiling
+--dry-run : The number of times to dry run the model before measuring the
performance. Default value is 10
+--run-count : The number of times to run the model and take an average. Default
value is 50.
+--zero-copy: This option enables graph runtime zero copy to be used for input
and output rather than byte copy to DLTensor.
+
+Performance profile options dumps information summary as given below.
+ Module Load :27 ms
+ Graph Runtime Create :11 ms
+ Params Read :15 ms
+ Params Set :41 ms
+ Pre Compiled Progs Load :24 ms
+Total Load Time :118 ms
+Average ExecTime :27 ms
+Unload Time :35.9236 ms
diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc
index c38a5f62bd..dc3cf1c414 100644
--- a/apps/cpp_rtvm/main.cc
+++ b/apps/cpp_rtvm/main.cc
@@ -29,6 +29,7 @@
#endif
#include <dmlc/logging.h>
+#include <chrono>
#include <cstring>
#include <iostream>
#include <sstream>
@@ -54,7 +55,11 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use
random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
- "--pre-compiled - The file name of a file where pre-compiled programs
should be stored"
+ "--pre-compiled - The file name of a file where pre-compiled programs
should be stored\n"
+ "--profile - Profile over all execution\n"
+ "--dry-run - Profile after given dry runs, default 10\n"
+ "--run-count - Profile for given runs, default 50\n"
+ "--zero-copy - Profile with zero copy api\n"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
@@ -68,6 +73,7 @@ static const string kUsage =
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
* \arg pre_compiled File name where pre-compiled programs should be stored
+ * \arg profile Do we profile overall execution
*/
struct ToolArgs {
string model;
@@ -75,7 +81,11 @@ struct ToolArgs {
string input;
string output;
string pre_compiled;
- bool dump_meta = false;
+ bool dump_meta{false};
+ bool profile{false};
+ int dry_run{10};
+ int run_count{50};
+ bool zero_copy{false};
};
/*!
@@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Output = " << args.output;
LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
+ LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False"));
+ LOG(INFO) << "Dry Run = " << args.dry_run;
+ LOG(INFO) << "Run Count = " << args.run_count;
+ LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False"));
}
#if defined(__linux__) || defined(__ANDROID__)
@@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs&
args) {
}
args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
+
+ const string pprofile = GetCmdOption(argc, argv, "--profile", true);
+ if (!pprofile.empty()) {
+ args.profile = true;
+ }
+
+ const string pdry_run = GetCmdOption(argc, argv, "--dry-run=");
+ if (!pdry_run.empty()) {
+ args.dry_run = stoi(pdry_run);
+ }
+
+ const string prun = GetCmdOption(argc, argv, "--run-count=");
+ if (!prun.empty()) {
+ args.run_count = stoi(prun);
+ }
+
+ const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true);
+ if (!pzcopy.empty()) {
+ args.zero_copy = true;
+ }
}
/*!
@@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
#endif
// Initialize TVM Runner
- TVMRunner runner = TVMRunner(args.model, args.device);
+ auto runner = new TVMRunner(args.model, args.device);
// Load the model
- runner.Load();
+ runner->Load();
if (!args.pre_compiled.empty()) {
- runner.UsePreCompiledPrograms(args.pre_compiled);
+ runner->UsePreCompiledPrograms(args.pre_compiled);
}
// Query Model meta Information
- TVMMetaInfo mInfo = runner.GetMetaInfo();
+ TVMMetaInfo mInfo = runner->GetMetaInfo();
// Print Meta Information
- if (args.dump_meta) runner.PrintMetaInfo();
+ if (args.dump_meta) runner->PrintMetaInfo();
+
+ int total_exec_time = 0;
+
+ if (args.profile) {
+ if (args.dry_run) {
+ for (int ii = 0; ii < args.dry_run; ++ii) {
+ runner->Run();
+ }
+ TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
+ }
+ int total_time = 0;
+ std::map<std::string, NDArray> input_data_even, input_data_odd;
+ std::map<std::string, NDArray> output_data_even, output_data_odd;
+
+ std::map<std::string, char*> input_data;
+ std::map<std::string, char*> output_data;
+
+ // Alloc / populate and keep input data ready
+ for (auto& elem : mInfo.input_info) {
+ if (args.zero_copy) {
+ auto ndarr =
+ NDArray::Empty(elem.second.first,
tvm::runtime::String2DLDataType(elem.second.second),
+ DLDevice{GetTVMDevice(args.device), 0});
+ input_data_even.insert({elem.first, ndarr});
+
+ ndarr =
+ NDArray::Empty(elem.second.first,
tvm::runtime::String2DLDataType(elem.second.second),
+ DLDevice{GetTVMDevice(args.device), 0});
+ input_data_odd.insert({elem.first, ndarr});
+ } else {
+ char* data = (char*)malloc(runner->GetInputMemSize(elem.first));
+ input_data.insert({elem.first, data});
+ }
+ }
+
+ // Alloc and keep output buffers ready
+ for (auto& elem : mInfo.output_info) {
+ if (args.zero_copy) {
+ auto ndarr =
+ NDArray::Empty(elem.second.first,
tvm::runtime::String2DLDataType(elem.second.second),
+ DLDevice{GetTVMDevice(args.device), 0});
+ output_data_even.insert({elem.first, ndarr});
+
+ ndarr =
+ NDArray::Empty(elem.second.first,
tvm::runtime::String2DLDataType(elem.second.second),
+ DLDevice{GetTVMDevice(args.device), 0});
+ output_data_odd.insert({elem.first, ndarr});
+ } else {
+ char* data = (char*)malloc(runner->GetOutputMemSize(elem.first));
+ output_data.insert({elem.first, data});
+ }
+ }
+
+ for (int ii = 0; ii < args.run_count; ++ii) {
+ // Timer start
+ auto tstart = std::chrono::high_resolution_clock::now();
+ // Set random input for all input
+ for (auto& elem : mInfo.input_info) {
+ if (args.zero_copy) {
+ if (ii % 2) {
+ runner->SetInput(elem.first, input_data_even[elem.first]);
+ } else {
+ runner->SetInput(elem.first, input_data_odd[elem.first]);
+ }
+ } else {
+ runner->SetInput(elem.first, input_data[elem.first]);
+ }
+ }
+
+ if (args.zero_copy) {
+ // With zero copy set the result NDArray up front
+ for (auto& elem : mInfo.output_info) {
+ if (ii % 2) {
+ runner->SetOutput(elem.first, output_data_even[elem.first]);
+ } else {
+ runner->SetOutput(elem.first, output_data_odd[elem.first]);
+ }
+ }
+ }
- if (args.input.empty() || args.output.empty()) {
+ // Run the model
+ runner->Run();
+
+ if (!args.zero_copy) {
+ // W/o zero copy we need to invoke explicit data copy
+ for (auto& elem : mInfo.output_info) {
+ runner->GetOutput(elem.first, output_data[elem.first]);
+ }
+ } else {
+ // Just wait for the run to complete.
+ TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
+ }
+
+ // Timer end
+ auto tend = std::chrono::high_resolution_clock::now();
+ LOG(INFO) << "Exec Time:" << static_cast<double>((tend -
tstart).count()) / 1e6;
+ total_exec_time += static_cast<double>((tend - tstart).count()) / 1e6;
+ }
+
+ // Free input buffers
+ for (auto& elem : mInfo.input_info) {
+ free(input_data[elem.first]);
+ }
+
+ // Free output buffers
+ for (auto& elem : mInfo.output_info) {
+ free(output_data[elem.first]);
+ }
+ } else if (!args.input.empty() && !args.output.empty()) {
+ LOG(INFO) << "Executing with Input:" << args.input << " Output:" <<
args.output;
+ // Set Input from Numpy Input
+ runner->SetInput(args.input);
+ // Run the model
+ runner->Run();
+ // Get Output as Numpy dump
+ runner->GetOutput(args.output);
+ } else {
LOG(INFO) << "Executing dry run ... ";
// Set random input for all inputs
for (auto& elem : mInfo.input_info) {
LOG(INFO) << "Set Random Input for :" << elem.first;
auto shape = elem.second.first;
- size_t ssize = runner.GetInputMemSize(elem.first);
+ size_t ssize = runner->GetInputMemSize(elem.first);
char* data = (char*)malloc(ssize);
LOG(INFO) << "Random Input Size:" << ssize << " bytes";
- runner.SetInput(elem.first, data);
+ runner->SetInput(elem.first, data);
free(data);
}
-
// Run the model
- runner.Run();
-
+ runner->Run();
// Get Output and dump few values
for (auto& elem : mInfo.output_info) {
LOG(INFO) << "Get Output for :" << elem.first;
auto shape = elem.second.first;
- size_t ssize = runner.GetOutputMemSize(elem.first);
+ size_t ssize = runner->GetOutputMemSize(elem.first);
char* data = (char*)malloc(ssize);
- runner.GetOutput(elem.first, data);
+ runner->GetOutput(elem.first, data);
LOG(INFO) << "Output Size:" << ssize << " bytes";
free(data);
}
- } else {
- LOG(INFO) << "Executing with Input:" << args.input << " Output:" <<
args.output;
-
- // Set Input from Numpy Input
- runner.SetInput(args.input);
-
- // Run the model
- runner.Run();
+ }
- // Get Output as Numpy dump
- runner.GetOutput(args.output);
+ if (args.profile) {
+ // Print Stats
+ runner->PrintStats();
}
+ auto tstart = std::chrono::high_resolution_clock::now();
+ delete runner;
+ auto tend = std::chrono::high_resolution_clock::now();
+ if (args.profile) {
+ LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << "
ms";
+ LOG(INFO) << "Unload Time :" << static_cast<double>((tend -
tstart).count()) / 1e6
+ << " ms";
+ }
return 0;
}
diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc
index 2fd4f2281e..7d6dbc23ee 100644
--- a/apps/cpp_rtvm/tvm_runner.cc
+++ b/apps/cpp_rtvm/tvm_runner.cc
@@ -26,10 +26,12 @@
#include <cnpy.h>
+#include <chrono>
#include <fstream>
#include <iterator>
#include <streambuf>
#include <string>
+#include <vector>
namespace tvm {
namespace runtime {
@@ -39,25 +41,25 @@ namespace runtime {
* \param device the target device in string format.
* \return dl_device corresponding to the device string.
*/
-int GetTVMDevice(std::string device) {
+DLDeviceType GetTVMDevice(std::string device) {
if (!device.compare("cpu")) {
- return static_cast<int>(kDLCPU);
+ return kDLCPU;
} else if (!device.compare("llvm")) {
- return static_cast<int>(kDLCPU);
+ return kDLCPU;
} else if (!device.compare("cuda")) {
- return static_cast<int>(kDLCUDA);
+ return kDLCUDA;
} else if (!device.compare("opencl")) {
- return static_cast<int>(kDLOpenCL);
+ return kDLOpenCL;
} else if (!device.compare("vulkan")) {
- return static_cast<int>(kDLVulkan);
+ return kDLVulkan;
} else if (!device.compare("metal")) {
- return static_cast<int>(kDLMetal);
+ return kDLMetal;
} else if (!device.compare("vpi")) {
- return static_cast<int>(kDLVPI);
+ return kDLVPI;
} else if (!device.compare("rocm")) {
- return static_cast<int>(kDLROCM);
+ return kDLROCM;
} else if (!device.compare("oneapi")) {
- return static_cast<int>(kDLOneAPI);
+ return kDLOneAPI;
} else {
LOG(FATAL) << "TVMRunner : Unsupported device :" << device;
}
@@ -80,34 +82,59 @@ TVMRunner::TVMRunner(std::string path, std::string device)
int TVMRunner::Load(void) {
LOG(INFO) << "TVMRunner Load:" << r_model_path;
// Load the lib file
+ auto tstart = std::chrono::high_resolution_clock::now();
+
r_mod_handle = Module::LoadFromFile((r_model_path + "/mod.so").c_str(),
"so");
+ auto tend = std::chrono::high_resolution_clock::now();
+ r_module_load_ms = static_cast<double>((tend - tstart).count()) / 1e6;
+ tstart = std::chrono::high_resolution_clock::now();
// Read model json file
std::ifstream json_reader((r_model_path + "/mod.json").c_str());
CHECK(!json_reader.fail()) << "Failed to open json file:" << (r_model_path +
"/mod.json").c_str();
- std::string json_str((std::istreambuf_iterator<char>(json_reader)),
- std::istreambuf_iterator<char>());
+ json_reader.seekg(0, std::ios_base::end);
+ std::size_t json_size = json_reader.tellg();
+ json_reader.seekg(0, std::ios_base::beg);
+ std::string json_data;
+ json_data.reserve(json_size);
+ json_reader.read((char*)json_data.c_str(), json_size);
json_reader.close();
// Get ref to graph exeutor
auto f_handle = tvm::runtime::Registry::Get("tvm.graph_executor.create");
// Greate graph runtime
- r_graph_handle = (*f_handle)(json_str, r_mod_handle, GetTVMDevice(r_device),
0);
+ r_graph_handle =
+ (*f_handle)(json_data, r_mod_handle,
static_cast<int>(GetTVMDevice(r_device)), 0);
+
+ tend = std::chrono::high_resolution_clock::now();
+ r_graph_load_ms = static_cast<double>((tend - tstart).count()) / 1e6;
// Read params binary file
+ tstart = std::chrono::high_resolution_clock::now();
std::ifstream params_reader((r_model_path + "/mod.params").c_str(),
std::ios::binary);
CHECK(!params_reader.fail()) << "Failed to open json file:"
<< (r_model_path + "/mod.params").c_str();
- const std::string params_str((std::istreambuf_iterator<char>(params_reader)),
- std::istreambuf_iterator<char>());
+
+ params_reader.seekg(0, std::ios_base::end);
+ std::size_t param_size = params_reader.tellg();
+ params_reader.seekg(0, std::ios_base::beg);
+ std::vector<char> param_data(param_size / sizeof(char));
+ params_reader.read((char*)¶m_data[0], param_size);
params_reader.close();
+
TVMByteArray params_arr;
- params_arr.data = params_str.c_str();
- params_arr.size = params_str.length();
+ params_arr.data = (char*)¶m_data[0];
+ params_arr.size = param_size;
+
+ tend = std::chrono::high_resolution_clock::now();
+ r_param_read_ms = static_cast<double>((tend - tstart).count()) / 1e6;
// Load parameters
+ tstart = std::chrono::high_resolution_clock::now();
r_graph_handle.GetFunction("load_params")(params_arr);
+ tend = std::chrono::high_resolution_clock::now();
+ r_param_load_ms = static_cast<double>((tend - tstart).count()) / 1e6;
return 0;
}
@@ -117,6 +144,7 @@ int TVMRunner::Load(void) {
* \param file_name File name where pre-compiled programs should be stored.
*/
void TVMRunner::UsePreCompiledPrograms(std::string file_name) {
+ auto tstart = std::chrono::high_resolution_clock::now();
if (r_run_was_called) {
LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before
first run";
return;
@@ -130,10 +158,19 @@ void TVMRunner::UsePreCompiledPrograms(std::string
file_name) {
std::ofstream fs(file_name, std::ofstream::binary);
fs.write(bytes.c_str(), bytes.size());
} else {
- std::string bytes((std::istreambuf_iterator<char>(ifs)),
std::istreambuf_iterator<char>());
- f_set(String(bytes));
+ ifs.seekg(0, std::ios_base::end);
+ std::size_t blob_size = ifs.tellg();
+ ifs.seekg(0, std::ios_base::beg);
+ std::string blob_data;
+ blob_data.reserve(blob_size);
+ blob_data.resize(blob_size);
+ ifs.read((char*)blob_data.c_str(), blob_size);
+ ifs.close();
+ f_set(String(blob_data));
}
}
+ auto tend = std::chrono::high_resolution_clock::now();
+ r_pre_compiled_load_ms = static_cast<double>((tend - tstart).count()) / 1e6;
}
/*!
@@ -156,8 +193,6 @@ inline size_t GetMemSize(NDArray& narr) {
* \return The memory size.
*/
size_t TVMRunner::GetInputMemSize(std::string input_id) {
- LOG(INFO) << "TVMRunner::GetInputMemSize:" << input_id;
-
NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
auto ssize = GetMemSize(in_arr);
@@ -170,8 +205,6 @@ size_t TVMRunner::GetInputMemSize(std::string input_id) {
* \return The memory size.
*/
size_t TVMRunner::GetOutputMemSize(std::string output_id) {
- LOG(INFO) << "TVMRunner::GetOutputMemSize:" << output_id;
-
NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
auto ssize = GetMemSize(out_arr);
@@ -209,13 +242,23 @@ int TVMRunner::SetInput(std::string inputfile) {
* \param 0 on success else error code.
*/
int TVMRunner::SetInput(std::string input_id, char* raw_input) {
- LOG(INFO) << "TVMRunner::SetInput (Raw)";
NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id);
auto ssize = GetMemSize(in_arr);
in_arr.CopyFromBytes(raw_input, ssize);
return 0;
}
+/*!
+ * \brief Set the model input from given NDArray with zero copy.
+ * \param input_id input node name.
+ * \param ndarr NDArray.
+ * \param 0 on success else error code.
+ */
+int TVMRunner::SetInput(std::string input_id, NDArray& ndarr) {
+ r_graph_handle.GetFunction("set_input_zero_copy")(input_id, ndarr);
+ return 0;
+}
+
/*!
* \brief Get the model outputs and dump them to npz file.
* \param outputfile the npz file to where we dump the output data.
@@ -255,21 +298,29 @@ int TVMRunner::GetOutput(std::string outputfile) {
* \param 0 on success else error code.
*/
int TVMRunner::GetOutput(std::string output_id, char* raw_output) {
- LOG(INFO) << "TVMRunner::GetOutput (Raw)";
NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id);
auto ssize = GetMemSize(out_arr);
out_arr.CopyToBytes(raw_output, ssize);
return 0;
}
+/*!
+ * \brief Set the model output from given NDArray with zero copy.
+ * \param output_id output node name.
+ * \param ndarr NDArray.
+ * \param 0 on success else error code.
+ */
+int TVMRunner::SetOutput(std::string output_id, NDArray& ndarr) {
+ r_graph_handle.GetFunction("set_output_zero_copy")(output_id, ndarr);
+ return 0;
+}
+
/*!
* \brief Call one cycle of execution for the model.
* \param 0 on success else error code.
*/
int TVMRunner::Run(void) {
- LOG(INFO) << "TVMRunner::Run";
r_run_was_called = true;
-
r_graph_handle.GetFunction("run")();
return 0;
}
@@ -289,10 +340,10 @@ TVMMetaInfo TVMRunner::GetMetaInfo(void) {
auto dtype_info = GetRef<Map<String,
ObjectRef>>(tvm_input_info["dtype"].as<MapNode>());
for (const auto& kv : shape_info) {
auto stuple = GetRef<ShapeTuple>(kv.second.as<ShapeTupleObj>());
- std::vector<int> vshape;
+ std::vector<int64_t> vshape;
vshape.assign(stuple.begin(), stuple.end());
auto dtype = GetRef<String>(dtype_info[kv.first].as<StringObj>());
- std::pair<std::vector<int>, std::string> value = std::make_pair(vshape,
dtype);
+ std::pair<std::vector<int64_t>, std::string> value =
std::make_pair(vshape, dtype);
mInfo.input_info.insert({kv.first, value});
}
@@ -301,10 +352,10 @@ TVMMetaInfo TVMRunner::GetMetaInfo(void) {
dtype_info = GetRef<Map<String,
ObjectRef>>(tvm_input_info["dtype"].as<MapNode>());
for (const auto& kv : shape_info) {
auto stuple = GetRef<ShapeTuple>(kv.second.as<ShapeTupleObj>());
- std::vector<int> vshape;
+ std::vector<int64_t> vshape;
vshape.assign(stuple.begin(), stuple.end());
auto dtype = GetRef<String>(dtype_info[kv.first].as<StringObj>());
- std::pair<std::vector<int>, std::string> value = std::make_pair(vshape,
dtype);
+ std::pair<std::vector<int64_t>, std::string> value =
std::make_pair(vshape, dtype);
mInfo.output_info.insert({kv.first, value});
}
@@ -343,5 +394,21 @@ void TVMRunner::PrintMetaInfo(void) {
}
}
+/*!
+ * \brief Print stats information.
+ */
+void TVMRunner::PrintStats(void) {
+ LOG(INFO) << "Performance Stats:" << r_model_path;
+ LOG(INFO) << " Module Load :" << r_module_load_ms << " ms";
+ LOG(INFO) << " Graph Runtime Create :" << r_graph_load_ms << " ms";
+ LOG(INFO) << " Params Read :" << r_param_read_ms << " ms";
+ LOG(INFO) << " Params Set :" << r_param_load_ms << " ms";
+ LOG(INFO) << " Pre Compiled Progs Load :" << r_pre_compiled_load_ms << "
ms";
+ LOG(INFO) << "Total Load Time :"
+ << r_module_load_ms + r_graph_load_ms + r_param_read_ms +
r_param_load_ms +
+ r_pre_compiled_load_ms
+ << " ms";
+}
+
} // namespace runtime
} // namespace tvm
diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h
index 47717c3ecf..e93b63ae85 100644
--- a/apps/cpp_rtvm/tvm_runner.h
+++ b/apps/cpp_rtvm/tvm_runner.h
@@ -41,8 +41,8 @@ namespace runtime {
typedef struct _TVMMetaInfo {
int n_inputs;
int n_outputs;
- std::map<std::string, std::pair<std::vector<int>, std::string>> input_info;
- std::map<std::string, std::pair<std::vector<int>, std::string>> output_info;
+ std::map<std::string, std::pair<std::vector<int64_t>, std::string>>
input_info;
+ std::map<std::string, std::pair<std::vector<int64_t>, std::string>>
output_info;
} TVMMetaInfo;
/*!
@@ -63,10 +63,14 @@ class TVMRunner {
int SetInput(std::string);
/*! \brief To set the input from binary data */
int SetInput(std::string, char*);
+ /*! \brief To set the input from NDArray */
+ int SetInput(std::string, NDArray& ndarr);
/*! \brief Save the model output into given npz file */
int GetOutput(std::string);
/*! \brief Get the model output in binary format */
int GetOutput(std::string, char*);
+ /*! \brief Swap output NDArray with given one */
+ int SetOutput(std::string, NDArray& ndarr);
/*! \brief To get the input mem size */
size_t GetInputMemSize(std::string);
/*! \brief To get the output mem size */
@@ -76,6 +80,21 @@ class TVMRunner {
/*! \brief Print function to show all meta information */
void PrintMetaInfo(void);
+ /*! \brief Print function to show all stats information */
+ void PrintStats(void);
+
+ // Public profiling information
+ /*! Module load time */
+ int r_module_load_ms{0};
+ /*! Graph runtime creation time */
+ int r_graph_load_ms{0};
+ /*! Params read time */
+ int r_param_read_ms{0};
+ /*! Params load time */
+ int r_param_load_ms{0};
+ /*! Pre compiled programs load time */
+ int r_pre_compiled_load_ms{0};
+
private:
/*! \brief Module handle for the shared object */
Module r_mod_handle;
@@ -91,6 +110,7 @@ class TVMRunner {
bool r_run_was_called;
};
+DLDeviceType GetTVMDevice(std::string device);
} // namespace runtime
} // namespace tvm
#endif // TVM_APPS_CPP_RTVM_RUNNER_H_