trevor-m commented on a change in pull request #6395:
URL: https://github.com/apache/incubator-tvm/pull/6395#discussion_r491051930



##########
File path: src/runtime/contrib/tensorrt/tensorrt_runtime.cc
##########
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/tensorrt/tensorrt_runtime.cc
+ * \brief JSON runtime implementation for TensorRT.
+ */
+
+#include <dmlc/parameter.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
+
+#include <fstream>
+
+#include "../../file_util.h"
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+
+#ifdef TVM_GRAPH_RUNTIME_TENSORRT
+#include "NvInfer.h"
+#include "tensorrt_builder.h"
+#endif
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace tvm::runtime::json;
+
+class TensorRTRuntime : public JSONRuntimeBase {
+ public:
+  /*!
+   * \brief The TensorRT runtime module. Deserialize the provided functions
+   * on creation and store in the layer cache.
+   *
+   * \param symbol_name The name of the function.
+   * \param graph_json The serialized JSON representation of a sub-graph.
+   * \param const_names The names of each constant in the sub-graph.
+   */
+  explicit TensorRTRuntime(const std::string& symbol_name, const std::string& graph_json,
+                           const Array<String>& const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names), use_implicit_batch_(true),
+        max_workspace_size_(size_t(1) << 30) {}
+
+  /*!
+   * \brief The type key of the module.
+   *
+   * \return module type key.
+   */
+  const char* type_key() const override { return "tensorrt"; }
+
+  /*!
+   * \brief Initialize runtime. Create TensorRT layer from JSON
+   * representation.
+   *
+   * \param consts The constant params from compiled model.
+   */
+  void Init(const Array<NDArray>& consts) override {
+    CHECK_EQ(consts.size(), const_idx_.size())
+        << "The number of input constants must match the number required.";
+    LoadGlobalAttributes();
+    if (GetCachedEnginesFromDisk()) return;
+    SetupConstants(consts);
+    BuildEngine();
+    CacheEngineToDisk();
+  }
+
+  void LoadGlobalAttributes() {
+    // These settings are global to the entire subgraph. Codegen will add them as attributes to all
+    // op nodes. Read them from the first one.
+    for (size_t i = 0; i < nodes_.size(); ++i) {
+      if (nodes_[i].HasAttr("use_implicit_batch") && nodes_[i].HasAttr("max_workspace_size")) {
+        use_implicit_batch_ =
+            std::stoi(nodes_[i].GetAttr<std::vector<std::string>>("use_implicit_batch")[0]);
+        // Allow max_workspace_size to be overridden at runtime.
+        size_t runtime_max_workspace_size =
+            dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(0));
+        if (runtime_max_workspace_size != 0) {
+          max_workspace_size_ = runtime_max_workspace_size;
+        } else {
+          max_workspace_size_ =
+              std::stoul(nodes_[i].GetAttr<std::vector<std::string>>("max_workspace_size")[0]);
+        }
+        return;
+      }
+    }
+  }
+
+#ifdef TVM_GRAPH_RUNTIME_TENSORRT
+  /*! \brief Run inference using built engine. */
+  void Run() override {
+    auto& engine_and_context = trt_engine_cache_.at(symbol_name_);
+    auto engine = engine_and_context.engine;
+    auto context = engine_and_context.context;
+    std::vector<void*> bindings(engine->getNbBindings(), nullptr);
+
+    for (size_t i = 0; i < input_nodes_.size(); ++i) {
+      auto nid = input_nodes_[i];
+      if (nodes_[nid].GetOpType() == "input") {
+        for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) {
+          uint32_t eid = EntryID(nid, j);
+          const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j);
+          int binding_index = engine->getBindingIndex(name.c_str());
+          CHECK_NE(binding_index, -1);
+          bindings[binding_index] = data_entry_[eid]->data;
+        }
+      }
+    }
+
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      uint32_t eid = EntryID(outputs_[i]);
+      const std::string& name = engine_and_context.outputs[i];
+      int binding_index = engine->getBindingIndex(name.c_str());
+      CHECK_NE(binding_index, -1);
+      bindings[binding_index] = data_entry_[eid]->data;
+    }
+
+#if TRT_VERSION_GE(6, 0, 1)
+    if (use_implicit_batch_) {
+      CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed.";
+    } else {
+      CHECK(context->executeV2(bindings.data())) << "Running TensorRT failed.";
+    }
+#else
+    CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed.";
+#endif
+  }
+
+ private:
+  /*!
+   * \brief Build TensorRT engine from JSON representation.
+   */
+  void BuildEngine() {
+    LOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_;
+    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
+    batch_size_ = GetBatchSize();
+    TensorRTBuilder builder(&logger_, max_workspace_size_, use_implicit_batch_, use_fp16,
+                            batch_size_);
+
+    // Add inputs and constants.
+    for (size_t i = 0; i < input_nodes_.size(); ++i) {
+      auto nid = input_nodes_[i];
+      const auto& node = nodes_[nid];
+      std::string name = node.GetOpName();
+      if (node.GetOpType() == "input") {
+        builder.AddInput(nid, node);
+      } else {
+        CHECK_EQ(node.GetOpType(), "const");
+        uint32_t eid = EntryID(nid, 0);
+        builder.AddConstant(nid, data_entry_[eid]);
+      }
+    }
+
+    // Add layers.
+    for (size_t nid = 0; nid < nodes_.size(); ++nid) {
+      const auto& node = nodes_[nid];
+      if (node.GetOpType() != "kernel") continue;
+      builder.AddLayer(nid, node);
+    }
+
+    // Add outputs.
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      builder.AddOutput(outputs_[i]);
+    }
+
+    // Build engine.
+    trt_engine_cache_[symbol_name_] = builder.BuildEngine();
+    LOG(INFO) << "Finished building TensorRT engine for subgraph " << 
symbol_name_;
+  }
+
+  /*! \brief If TVM_TENSORRT_CACHE_DIR is set, check that directory for
+   * already-built TRT engines and load them into trt_engine_cache_ so they
+   * don't have to be rebuilt at the first inference.
+   */
+  bool GetCachedEnginesFromDisk() {

Review comment:
       AFAIK `SaveToBinary` is only ever invoked during compilation.
   
   The engine is only built at runtime because it is specific to the target GPU and platform, so `CacheEnginesToDisk` also needs to be performed by the runtime.
   
   See https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work
   
   > To optimize your model for inference, TensorRT takes your network definition, performs optimizations including platform-specific optimizations, and generates the inference engine. This process is referred to as the build phase. The build phase can take considerable time, especially when running on embedded platforms. Therefore, a typical application will build an engine once, and then serialize it as a plan file for later use.
   > 
   > Note: The generated plan files are not portable across platforms or TensorRT versions. Plans are specific to the exact GPU model they were built on (in addition to the platforms and the TensorRT version) and must be re-targeted to the specific GPU in case you want to run them on a different GPU.
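   
   In case it helps to see the mechanics, the on-disk cache is just the usual TensorRT plan-file round trip. Here is a minimal sketch using the standard TensorRT C++ API (the helper names and the `path` argument are placeholders for illustration, not code from this PR):
   
   ```cpp
   #include <fstream>
   #include <iterator>
   #include <string>
   #include <vector>
   
   #include "NvInfer.h"
   
   // Serialize a built engine to a plan file so later runs can skip the
   // (potentially slow) build phase.
   void SaveEnginePlan(nvinfer1::ICudaEngine* engine, const std::string& path) {
     nvinfer1::IHostMemory* plan = engine->serialize();
     std::ofstream out(path, std::ios::binary);
     out.write(static_cast<const char*>(plan->data()), plan->size());
     plan->destroy();
   }
   
   // Rebuild an engine from a cached plan. As the docs above note, the plan is
   // only valid on the same GPU model, platform, and TensorRT version that built it.
   nvinfer1::ICudaEngine* LoadEnginePlan(nvinfer1::IRuntime* runtime, const std::string& path) {
     std::ifstream in(path, std::ios::binary);
     std::vector<char> plan((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
     return runtime->deserializeCudaEngine(plan.data(), plan.size(), /*pluginFactory=*/nullptr);
   }
   ```
   
   Since the plan bytes are device-specific, serializing at compile time (e.g. from `SaveToBinary`) would bake the build machine's plan into the artifact; keeping the cache on the runtime side avoids that.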

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

