comaniac commented on a change in pull request #6395: URL: https://github.com/apache/incubator-tvm/pull/6395#discussion_r492417754
########## File path: src/relay/backend/contrib/tensorrt/codegen.cc ########## @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/tensorrt/codegen.cc + * \brief Implementation of the TensorRT JSON serializer. + */ +#include <tvm/ir/module.h> +#include <tvm/relay/attrs/nn.h> +#include <tvm/relay/type.h> + +#include <memory> +#include <string> +#include <vector> + +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +#if TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +/*! \brief Attributes to store the compiler options for TensorRT. 
*/ +struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> { + Array<Integer> tensorrt_version; + bool use_implicit_batch; + size_t max_workspace_size; + bool remove_no_mac_subgraphs; + + TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { + TVM_ATTR_FIELD(tensorrt_version) + .describe("TensorRT version as (major, minor, patch).") + .set_default(Array<Integer>({6, 0, 1})); + TVM_ATTR_FIELD(use_implicit_batch).set_default(true); + TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); + TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); + } +}; + +class TensorRTCompilerConfig : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, + TensorRTCompilerConfigNode); +}; + +TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); + +/*! + * \brief Generates an TensorRTModule from a relay expression by serializing the expression to a + * json representation. TensorRT is not required here because use of TensorRT APIs is deferred until + * runtime. 
+ */ +class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr) {} + + std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) { + std::string name; + if (const auto* op_node = cn->op.as<OpNode>()) { + name = op_node->name; + } else { + return JSONSerializer::VisitExpr_(cn); + } + + std::vector<JSONGraphNodeEntry> inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared<JSONGraphNode>(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + if (name == "nn.pad") { + SetPadNodeAttribute(node, cn); + } else if (name == "strided_slice") { + SetStridedSliceNodeAttribute(node, cn); + } else { + SetCallNodeAttribute(node, cn); + } + // These attributes are global to the whole module. + SaveGlobalAttributes(node); + return AddNode(node, GetRef<Expr>(cn)); + } + + void SetPadNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* pad_attr = cn->attrs.as<PadAttrs>(); + CHECK(pad_attr); + auto p = pad_attr->pad_width; + const int dim_h = (p.size() == 5) ? 3 : 2; + const int dim_w = (p.size() == 5) ? 
4 : 3; + std::vector<std::string> padding = {std::to_string(p[dim_h][0].as<IntImmNode>()->value), + std::to_string(p[dim_w][0].as<IntImmNode>()->value), + std::to_string(p[dim_h][1].as<IntImmNode>()->value), + std::to_string(p[dim_w][1].as<IntImmNode>()->value)}; + std::vector<dmlc::any> padding_attr; + padding_attr.emplace_back(padding); + node->SetAttr("padding", padding_attr); + } + + void SetStridedSliceNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* attrs = cn->attrs.as<StridedSliceAttrs>(); + CHECK(attrs); + CHECK(attrs->begin && attrs->end && attrs->strides); + const bool default_strides = + !attrs->strides.value().defined() || attrs->strides.value().size() == 0; + auto ishape = backend::GetShape(cn->args[0]->checked_type()); + + auto process_slice_index = [](Integer x, int default_value, int dim_value) { + if (!x.defined()) return default_value; + int value = x.as<IntImmNode>()->value; + if (value < 0) value += dim_value; + return value; + }; + + std::vector<std::string> start, size, strides; + for (size_t i = 0; i < attrs->begin.value().size(); ++i) { + const int begin_value = process_slice_index(attrs->begin.value()[i], 0, ishape[i]); + const int end_value = process_slice_index(attrs->end.value()[i], ishape[i], ishape[i]); + const int stride_value = (default_strides || i >= attrs->strides.value().size() || + !attrs->strides.value()[i].defined()) + ? 
1 + : attrs->strides.value()[i].as<IntImmNode>()->value; + CHECK_GT(stride_value, 0); + const int size_value = (end_value - begin_value + stride_value - 1) / stride_value; + CHECK_GE(begin_value, 0); + CHECK_GT(size_value, 0); + start.push_back(std::to_string(begin_value)); + size.push_back(std::to_string(size_value)); + strides.push_back(std::to_string(stride_value)); + } + std::vector<dmlc::any> start_attr, size_attr, strides_attr; + start_attr.emplace_back(start); + size_attr.emplace_back(size); + strides_attr.emplace_back(strides); + node->SetAttr("start", start_attr); + node->SetAttr("size", size_attr); + node->SetAttr("strides", strides_attr); + } + + void SaveGlobalAttributes(std::shared_ptr<JSONGraphNode> node) { + auto ctx = transform::PassContext::Current(); + auto cfg = ctx->GetConfig<TensorRTCompilerConfig>("relay.ext.tensorrt.options"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues<TensorRTCompilerConfig>(); + } + CHECK_EQ(cfg.value()->tensorrt_version.size(), 3); + std::vector<std::string> tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]), + std::to_string(cfg.value()->tensorrt_version[1]), + std::to_string(cfg.value()->tensorrt_version[2])}; + std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; + std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)}; + std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr; + tensorrt_version_attr.emplace_back(tensorrt_version); + use_implicit_batch_attr.emplace_back(use_implicit_batch); + max_workspace_size_attr.emplace_back(max_workspace_size); + node->SetAttr("tensorrt_version", tensorrt_version_attr); + node->SetAttr("use_implicit_batch", use_implicit_batch_attr); + node->SetAttr("max_workspace_size", max_workspace_size_attr); + } +}; + +/*! + * \brief Create a runtime module for TensorRT. 
+ * \param ref The ext_func Relay expression/module to be executed using extern ops. + * \return A runtime module. + */ +runtime::Module TensorRTCompiler(const ObjectRef& ref) { + CHECK(ref->IsInstance<FunctionNode>()) << "The input ref is expected to be a Relay function."; + Function func = Downcast<Function>(ref); + std::string func_name = backend::GetExtSymbol(func); + + TensorRTJSONSerializer serializer(func_name, func); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + auto param_names = serializer.GetParams(); + const auto* pf = runtime::Registry::Get("runtime.tensorrt_runtime_create"); + CHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; Review comment: Improve the error message to be more TensorRT specific. ########## File path: src/runtime/contrib/tensorrt/tensorrt_builder.h ########## @@ -0,0 +1,159 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_builder.h + * \brief The TensorRTBuilder class can be used to convert a JSONRuntime graph into a TRT engine + * which can be used for inference. 
+ */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_BUILDER_H_ + +#include <string> +#include <unordered_map> +#include <vector> + +#include "../json/json_node.h" +#include "NvInfer.h" +#include "tensorrt_logger.h" +#include "tensorrt_ops.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; +using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + +/*! + * \brief The product of TensorRTBuilder which provides everything needed to + * perform inference. + */ +struct TrtEngineAndContext { Review comment: Better to be consistent with `TensorRTBuilder`: `TensorRTEngineAndContext`. ########## File path: src/runtime/contrib/tensorrt/tensorrt_ops.h ########## @@ -0,0 +1,208 @@ +/* * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/tensorrt_ops.h + * \brief Converters from Relay ops into TensorRT layers. Converters should + * inherit from TrtOpConverter and implement the Convert() method. 
+ */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_OPS_H_ + +#include <algorithm> +#include <cmath> +#include <memory> +#include <string> +#include <unordered_map> +#include <vector> + +#include "../json/json_node.h" +#include "NvInfer.h" +#include "tensorrt_utils.h" + +#if TRT_VERSION_GE(6, 0, 1) +#define TRT_HAS_IMPLICIT_BATCH(params) (params->network->hasImplicitBatchDimension()) +#else +#define TRT_HAS_IMPLICIT_BATCH(params) (true) +#endif + +namespace tvm { +namespace runtime { +namespace contrib { + +using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + +/*! + * \brief An input to a op may be either kTensor in the case of nvinfer::ITensor* + * or kWeight for nvinfer1::Weights. + */ +enum TrtInputType { + kTensor, + kWeight, +}; + +/*! + * \brief An input to a TrtOpConverter. The type of the input is either kTensor + * or kWeight. For kTensor, "tensor" contains the input tensor. For kWeight, + * "weight" contains the input weight and "weight_shape" contains the shape. + */ +struct TrtOpInput { + /*! \brief If type is kTensor, will store input tensor. */ + nvinfer1::ITensor* tensor; + + /*! \brief If type is kWeight, will store input weight. */ + nvinfer1::Weights weight; + + /*! \brief Whether the input is in tensor or weight. */ + TrtInputType type; + + /*! \brief If type is kWeight, will store weight shape. */ + std::vector<int> weight_shape; + + explicit TrtOpInput(nvinfer1::ITensor* tensor) + : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {} + TrtOpInput(nvinfer1::Weights weight, const std::vector<int>& shape) + : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {} +}; + +/*! \brief Parameters to convert an Op from relay to TensorRT. */ +struct AddTrtLayerParams { Review comment: What does "Add" mean here? 
########## File path: tests/python/contrib/test_tensorrt.py ########## @@ -0,0 +1,896 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import time +import pytest + +import tvm +import tvm.relay.testing +from tvm import relay +from tvm.relay.op.contrib import tensorrt +from tvm.contrib import graph_runtime + + +def skip_codegen_test(): + """Skip test if TensorRT and CUDA codegen are not present""" + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tvm.get_global_func("relay.ext.tensorrt", True): + print("Skip because TensorRT codegen is not available.") + return True + return False + + +def skip_runtime_test(): + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tensorrt.is_tensorrt_runtime_enabled(): + print("Skip because TensorRT runtime is not available.") + return True + return False + + +def run_and_verify(config): + if skip_codegen_test(): + return + f, input_shapes, is_param = config + params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} + input_dict = { + k: np.random.uniform(-1, 1, v).astype(np.float32) + for 
k, v in input_shapes.items() + if k not in is_param + } + + # Run TRT + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + mod.run(**input_dict) + results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + # Run reference + mod = tvm.IRModule() + mod["main"] = f + with tvm.transform.PassContext(opt_level=3): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + mod.run(**input_dict) + ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + assert len(results) == len(ref_results) + for i in range(len(results)): + res = results[i].asnumpy() + ref_res = ref_results[i].asnumpy() + assert res.shape == ref_res.shape + tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3) + + +def run_and_verify_model(model): + if skip_codegen_test(): + return + + def compile_and_run(i_data, input_shape, dtype, use_trt=True, num_iteration=1): + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model + + def check_trt_used(graph): + import json + + graph = json.loads(graph) + num_trt_subgraphs = sum( + [ + 1 + for n in graph["nodes"] + if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_") + ] + ) + assert num_trt_subgraphs >= 1 + + block = get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + + if use_trt: + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + graph, lib, params = relay.build(mod, "cuda", 
params=params) + check_trt_used(graph) + else: + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "cuda", params=params) + + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**params) + # Warmup + for i in range(10): + mod.run(data=i_data) + # Time + times = [] + for i in range(num_iteration): + start_time = time.time() + mod.run(data=i_data) + res = mod.get_output(0) + times.append(time.time() - start_time) + latency = 1000.0 * np.mean(times) + print(model, latency) + return res + + dtype = "float32" + input_shape = (1, 3, 224, 224) + i_data = np.random.uniform(-1, 1, input_shape).astype(dtype) + res = compile_and_run(i_data, input_shape, dtype, use_trt=True) + ref_res = compile_and_run(i_data, input_shape, dtype, use_trt=False, num_iteration=1) + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-3) + + +def test_tensorrt_simple(): + if skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 3, 2, 2) + yshape = (1, 3, 1, 1) + zshape = (1, 1, 1, 1) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.var("y", shape=(yshape), dtype=dtype) + z = relay.var("z", shape=(zshape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + y_data = np.random.uniform(-1, 1, yshape).astype(dtype) + z_data = np.random.uniform(-1, 1, zshape).astype(dtype) + mod.run(x=x_data, y=y_data, z=z_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_not_compatible(): + if 
skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 32, 14, 14) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.add(x, x) + z = relay.erf(y) + out = relay.nn.relu(z) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + mod.run(x=x_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_serialize(): + if skip_codegen_test(): + return + import mxnet + from mxnet.gluon.model_zoo.vision import get_model + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet( + block, shape={"data": (1, 3, 224, 224)}, dtype="float32" + ) + # Compile + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + lib = relay.build(mod, "cuda", params=params) + # Serialize + lib.export_library("compiled.so") + # Deserialize + loaded_lib = tvm.runtime.load_module("compiled.so") + # Run + if skip_runtime_test(): + return + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0))) + i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32") + for i in range(10): + gen_module.run(data=i_data) + + +def test_conv2d(): + def get_graph( + x_shape=(1, 32, 8, 8), + k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:4], + groups=groups, + 
padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + for k_shape, groups in [((16, 32, 3, 3), 1), ((32, 1, 3, 3), 32)]: + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + for dilation in [(1, 1), (2, 2)]: + run_and_verify( + get_graph( + k_shape=k_shape, + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + ) + + +def test_conv2d_nhwc(): + def get_graph(x_shape=(1, 8, 8, 32), k_shape=(3, 3, 32, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d( + x, + kernel, + channels=16, + kernel_size=(3, 3), + data_layout="NHWC", + kernel_layout="HWIO", + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify(get_graph()) + + +def test_conv2d_weights_const(): + def get_graph( + x_shape=(1, 32, 8, 8), + k_shape=(16, 32, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + dilation=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.const(np.ones(k_shape).astype("float32")) + out = relay.nn.conv2d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_conv2d_weights_transposed(): + def get_graph(x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + kernel_t = relay.transpose(kernel, order) + # Conv2d requires constant weights in TensorRT, so the weights should be transposed by + # FoldConstant. 
+ out = relay.nn.conv2d(x, kernel_t, channels=k_shape[order[0]], kernel_size=(3, 3)) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify(get_graph()) + + +def test_dense(): + def get_graph(x_shape=(1, 16), k_shape=(32, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + # Dense requires constant weights in TensorRT, so the weights are transposed by us. + out = relay.nn.dense(x, kernel, units=k_shape[0]) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify(get_graph()) + + +def test_bias_add(): + def get_graph(x_shape=(1, 16), channels=16): + x = relay.var("x", shape=(x_shape), dtype="float32") + bias = relay.var("bias", shape=(channels,), dtype="float32") + out = relay.nn.bias_add(x, bias) + f = relay.Function([x, bias], out) + return f, {"x": x_shape, "bias": (channels,)}, ["bias"] + + run_and_verify(get_graph()) + run_and_verify(get_graph((1, 6, 3, 4), 6)) + + +def test_pool2d(): + def get_graph( + op, + x_shape=(1, 3, 32, 32), + pool_size=(2, 2), + strides=(2, 2), + padding=(0, 0), + ceil_mode=False, + count_include_pad=None, + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + for pool_size in [(2, 2), (3, 3)]: + for strides in [(1, 1), (2, 2)]: + for padding in [(0, 0), (1, 1), (0, 0, 1, 1)]: + for ceil_mode in [False, True]: + # Skip "the padding size is larger than or equal to the filter size for exclusive-counting pooling" + if pool_size == (2, 2) and padding == (0, 0, 1, 1): + continue + for 
count_include_pad in [False, True]: + # Skip "inclusive-counted blended or average pooling is not supported in combination with asymmetric padding" + if count_include_pad and (padding == (0, 0, 1, 1) or strides == (2, 2)): + continue + run_and_verify( + get_graph( + relay.nn.avg_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + ) + run_and_verify( + get_graph( + relay.nn.max_pool2d, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + ) + + +def test_global_pool2d(): + def get_graph(op, x_shape=(1, 3, 32, 32)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph(relay.nn.global_max_pool2d)) + run_and_verify(get_graph(relay.nn.global_avg_pool2d)) + + +def test_batch_flatten(): + def get_graph(x_shape=(1, 3, 4, 6)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.batch_flatten(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_expand_dims(): + def get_graph(x_shape=(1, 3), axis=1, num_newaxis=1): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.expand_dims(x, axis, num_newaxis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_squeeze(): + def get_graph(x_shape, axis): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.squeeze(x, axis=axis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 5, 1, 1), (2, 3))) + run_and_verify(get_graph((1, 3, 1), (-1,))) + + +def test_concatenate(): + def get_graph(input_shapes, axis): + concat_inputs = [] + shapes_dict = {} + for i in range(len(input_shapes)): + name = "input_{}".format(i) + concat_inputs.append(relay.var(name, shape=(input_shapes[i]), dtype="float32")) + 
shapes_dict[name] = input_shapes[i] + out = relay.concatenate(concat_inputs, axis) + f = relay.Function(concat_inputs, out) + return f, shapes_dict, [] + + run_and_verify(get_graph([(1, 2, 6, 6), (1, 3, 6, 6)], axis=1)) + + +def test_conv2d_transpose(): + def get_graph( + x_shape=(1, 32, 8, 8), + k_shape=(32, 16, 3, 3), + groups=1, + padding=(0, 0), + strides=(1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv2d_transpose( + x, + kernel, + channels=k_shape[1], + kernel_size=k_shape[2:4], + groups=groups, + padding=padding, + strides=strides, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + for padding in [(0, 0), (1, 1)]: + for strides in [(1, 1), (2, 2)]: + run_and_verify(get_graph(padding=padding, strides=strides)) + + +def test_reshape(): + def get_graph(x_shape, new_shape): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.reshape(x, new_shape) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 1, 1, 10), (-1, 10))) + run_and_verify(get_graph((1, 10, 2, 3), (1, -1))) + run_and_verify(get_graph((1, 1, 2, 3), (1, 6))) + + +def test_transpose(): + def get_graph(x_shape, order): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.transpose(x, order) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 16, 7, 7), [0, 2, 3, 1])) + run_and_verify(get_graph((1, 7, 7, 16), [0, 3, 1, 2])) + + +def test_float_const(): + def get_graph(x_shape=(1, 16)): + x = relay.var("x", shape=(x_shape), dtype="float32") + beta = relay.const(1, dtype="float32") + out = relay.multiply(x, beta) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_pad(): + def get_graph(x_shape, pad_width): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = 
relay.nn.pad(x, pad_width=pad_width) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0]])) + run_and_verify(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [1, 1], [1, 1]])) + run_and_verify(get_graph((1, 8, 16, 16), [[0, 0], [0, 0], [0, 1], [2, 0]])) + run_and_verify(get_graph((1, 8, 3, 16, 16), [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]])) + + +def test_softmax(): + def get_graph(x_shape, axis): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.softmax(x, axis=axis) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 1000), axis=1)) + run_and_verify(get_graph((1, 1000), axis=-1)) + run_and_verify(get_graph((1, 3, 4), axis=-2)) + run_and_verify(get_graph((1, 3, 4), axis=1)) + + +def test_batch_norm(): + def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5): + x = relay.var("x", shape=(x_shape), dtype="float32") + beta = relay.var("beta", shape=(param_shape), dtype="float32") + gamma = relay.var("gamma", shape=(param_shape), dtype="float32") + moving_mean = relay.var("moving_mean", shape=(param_shape), dtype="float32") + moving_var = relay.var("moving_var", shape=(param_shape), dtype="float32") + out, _, _ = relay.nn.batch_norm( + x, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + axis=axis, + center=True, + scale=True, + epsilon=epsilon, + ) + f = relay.Function([x, gamma, beta, moving_mean, moving_var], out) + return ( + f, + { + "x": x_shape, + "beta": param_shape, + "gamma": param_shape, + "moving_mean": param_shape, + "moving_var": param_shape, + }, + ["beta", "gamma", "moving_mean", "moving_var"], + ) + + run_and_verify(get_graph((1, 64, 56, 56), (64,))) + run_and_verify(get_graph((1, 56, 56, 64), (64,), axis=3, epsilon=1.001e-05)) + + +def test_unary(): + def get_graph(op, x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x) + f = 
relay.Function([x], out) + return f, {"x": x_shape}, [] + + for op in [ + relay.nn.relu, + relay.sigmoid, + relay.tanh, + relay.exp, + relay.log, + relay.sqrt, + relay.abs, + relay.negative, + relay.sin, + relay.cos, + relay.atan, + relay.ceil, + relay.floor, + ]: + run_and_verify(get_graph(op)) + + +def test_clip(): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.clip(x, a_min=-0.2, a_max=0.4) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_leaky_relu(): + def get_graph(x_shape=(1, 8, 3, 3)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = relay.nn.leaky_relu(x, alpha=0.1) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph()) + + +def test_binary(): + def get_graph(op, x_shape, y_shape, y_is_const=False): + x = relay.var("x", shape=(x_shape), dtype="float32") + if y_is_const: + y = relay.const(np.ones(y_shape).astype("float32")) + out = op(x, y) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + y = relay.var("y", shape=(y_shape), dtype="float32") + out = op(x, y) + f = relay.Function([x, y], out) + return f, {"x": x_shape, "y": y_shape}, [] + + for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]: + for y_is_const in [True, False]: + run_and_verify(get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const)) + run_and_verify(get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const)) + run_and_verify(get_graph(op, (1, 10), (10,), y_is_const)) + run_and_verify(get_graph(op, (1, 1, 1, 10), (10,), y_is_const)) + run_and_verify(get_graph(op, (1, 1, 1), (3,), y_is_const)) + + +def test_reduce(): + def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x, axis=axis, keepdims=keepdims) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + for op in [relay.sum, relay.prod, 
relay.max, relay.min, relay.mean]: + for keepdims in [True, False]: + run_and_verify(get_graph(op, axis=(1), keepdims=keepdims)) + run_and_verify(get_graph(op, axis=(2, 3), keepdims=keepdims)) + run_and_verify(get_graph(op, axis=(1, 2), keepdims=keepdims)) + run_and_verify(get_graph(op, axis=(1, 2, 3), keepdims=keepdims)) + + +def test_strided_slice(): + def get_graph(x_shape, begin, end, strides=None): + x = relay.var("x", shape=(x_shape), dtype="float32") + if strides: + out = relay.strided_slice( + x, + relay.expr.const(begin, dtype="int32"), + relay.expr.const(end, dtype="int32"), + relay.expr.const(strides, dtype="int32"), + ) + else: + out = relay.strided_slice( + x, + relay.expr.const(begin, dtype="int32"), + relay.expr.const(end, dtype="int32"), + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph((1, 3, 6, 7), [0, 0, 0, 0], [1, 1, 6, 7])) + run_and_verify(get_graph((1, 3, 6, 7), [0, 1, 0, 0], [1, 2, 6, 6])) + run_and_verify(get_graph((1, 10), [0, 0], [1, 10], [1, 2])) + + +def test_adaptive_pool2d(): + def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1)): + x = relay.var("x", shape=(x_shape), dtype="float32") + out = op(x, out_size) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph(relay.nn.adaptive_max_pool2d)) + run_and_verify(get_graph(relay.nn.adaptive_avg_pool2d)) + + +def test_multiple_outputs(): + def get_graph(): + x = relay.var("x", shape=(1, 3), dtype="float32") + y = relay.var("y", shape=(1, 3), dtype="float32") + z = relay.add(x, y) + w = relay.add(z, y) + out = relay.Tuple((z, w)) + f = relay.Function([x, y], out) + return f, {"x": (1, 3), "y": (1, 3)}, [] + + run_and_verify(get_graph()) + + +def test_conv3d(): + def get_graph( + x_shape=(1, 32, 8, 8, 8), + k_shape=(16, 32, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + dilation=(1, 1, 1), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", 
shape=(k_shape), dtype="float32") + out = relay.nn.conv3d( + x, + kernel, + channels=k_shape[0], + kernel_size=k_shape[2:], + groups=groups, + padding=padding, + strides=strides, + dilation=dilation, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify(get_graph()) + run_and_verify(get_graph(padding=(0, 0, 0, 1, 1, 1))) + + +def test_pool3d(): + def get_graph( + op, + x_shape=(1, 3, 8, 32, 32), + pool_size=(2, 2, 2), + strides=(2, 2, 2), + padding=(0, 0, 0), + ceil_mode=False, + count_include_pad=None, + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + if count_include_pad is not None: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + out = op( + x, + pool_size=pool_size, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify(get_graph(relay.nn.avg_pool3d)) + run_and_verify(get_graph(relay.nn.max_pool3d)) + run_and_verify(get_graph(relay.nn.max_pool3d, padding=(0, 0, 0, 1, 1, 1))) + run_and_verify(get_graph(relay.nn.max_pool3d, strides=(1, 1, 1))) + + +def test_conv3d_transpose(): + def get_graph( + x_shape=(1, 32, 8, 8, 8), + k_shape=(32, 16, 3, 3, 3), + groups=1, + padding=(0, 0, 0), + strides=(1, 1, 1), + output_padding=(0, 0, 0), + ): + x = relay.var("x", shape=(x_shape), dtype="float32") + kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + out = relay.nn.conv3d_transpose( + x, + kernel, + channels=k_shape[1], + kernel_size=k_shape[2:5], + groups=groups, + padding=padding, + strides=strides, + output_padding=output_padding, + ) + f = relay.Function([x, kernel], out) + return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] + + run_and_verify(get_graph()) + run_and_verify(get_graph(strides=(2, 2, 2))) + run_and_verify(get_graph(strides=(2, 2, 2), output_padding=(1, 1, 1))) + + 
+def test_alexnet(): + run_and_verify_model("alexnet") + + +def test_resnet18_v1(): + run_and_verify_model("resnet18_v1") + + +def test_resnet18_v2(): + run_and_verify_model("resnet18_v2") + + +def test_squeezenet(): + run_and_verify_model("squeezenet1.0") + + +def test_mobilenet(): + run_and_verify_model("mobilenet0.25") + + +def test_mobilenet_v2(): + run_and_verify_model("mobilenetv2_0.25") + + +def test_vgg11(): + run_and_verify_model("vgg11") + + +def test_densenet121(): + run_and_verify_model("densenet121") Review comment: I don't think we need to test that many models in the CI. Maybe one ResNet and one MobileNet would be sufficient. ########## File path: tests/python/contrib/test_tensorrt.py ########## @@ -0,0 +1,896 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import numpy as np +import time +import pytest + +import tvm +import tvm.relay.testing +from tvm import relay +from tvm.relay.op.contrib import tensorrt +from tvm.contrib import graph_runtime + + +def skip_codegen_test(): + """Skip test if TensorRT and CUDA codegen are not present""" + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tvm.get_global_func("relay.ext.tensorrt", True): + print("Skip because TensorRT codegen is not available.") + return True + return False + + +def skip_runtime_test(): + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tensorrt.is_tensorrt_runtime_enabled(): + print("Skip because TensorRT runtime is not available.") + return True + return False + + +def run_and_verify(config): Review comment: It's unclear what this function does especially you have `run_and_verify_model`. Maybe `run_and_verify_func`? ########## File path: src/relay/backend/contrib/tensorrt/codegen.cc ########## @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/backend/contrib/tensorrt/codegen.cc + * \brief Implementation of the TensorRT JSON serializer. + */ +#include <tvm/ir/module.h> +#include <tvm/relay/attrs/nn.h> +#include <tvm/relay/type.h> + +#include <memory> +#include <string> +#include <vector> + +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +#if TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#endif + +namespace tvm { +namespace relay { +namespace contrib { + +/*! \brief Attributes to store the compiler options for TensorRT. */ +struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> { + Array<Integer> tensorrt_version; + bool use_implicit_batch; + size_t max_workspace_size; + bool remove_no_mac_subgraphs; + + TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") { + TVM_ATTR_FIELD(tensorrt_version) + .describe("TensorRT version as (major, minor, patch).") + .set_default(Array<Integer>({6, 0, 1})); + TVM_ATTR_FIELD(use_implicit_batch).set_default(true); + TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30); + TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false); + } +}; + +class TensorRTCompilerConfig : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(TensorRTCompilerConfig, Attrs, + TensorRTCompilerConfigNode); +}; + +TVM_REGISTER_NODE_TYPE(TensorRTCompilerConfigNode); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.tensorrt.options", TensorRTCompilerConfig); + +/*! + * \brief Generates a TensorRTModule from a relay expression by serializing the expression to a + * json representation. TensorRT is not required here because use of TensorRT APIs is deferred until + * runtime. 
+ */ +class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + TensorRTJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer(symbol, expr) {} + + std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) { + std::string name; + if (const auto* op_node = cn->op.as<OpNode>()) { + name = op_node->name; + } else { + return JSONSerializer::VisitExpr_(cn); + } + + std::vector<JSONGraphNodeEntry> inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared<JSONGraphNode>(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + if (name == "nn.pad") { + SetPadNodeAttribute(node, cn); + } else if (name == "strided_slice") { + SetStridedSliceNodeAttribute(node, cn); + } else { + SetCallNodeAttribute(node, cn); + } + // These attributes are global to the whole module. + SaveGlobalAttributes(node); + return AddNode(node, GetRef<Expr>(cn)); + } + + void SetPadNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* pad_attr = cn->attrs.as<PadAttrs>(); + CHECK(pad_attr); + auto p = pad_attr->pad_width; + const int dim_h = (p.size() == 5) ? 3 : 2; + const int dim_w = (p.size() == 5) ? 
4 : 3; + std::vector<std::string> padding = {std::to_string(p[dim_h][0].as<IntImmNode>()->value), + std::to_string(p[dim_w][0].as<IntImmNode>()->value), + std::to_string(p[dim_h][1].as<IntImmNode>()->value), + std::to_string(p[dim_w][1].as<IntImmNode>()->value)}; + std::vector<dmlc::any> padding_attr; + padding_attr.emplace_back(padding); + node->SetAttr("padding", padding_attr); + } + + void SetStridedSliceNodeAttribute(std::shared_ptr<JSONGraphNode> node, const CallNode* cn) { + const auto* attrs = cn->attrs.as<StridedSliceAttrs>(); + CHECK(attrs); + CHECK(attrs->begin && attrs->end && attrs->strides); Review comment: We can merge these two checks and provide a proper error message. ########## File path: src/runtime/contrib/tensorrt/tensorrt_utils.h ########## @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file runtime/contrib/tensorrt/utils.h + * \brief Helper functions used by TensorRTBuilder or TrtOpConverters. + */ + +#ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_UTILS_H_ +#define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_UTILS_H_ + +#include <string> +#include <vector> + +#include "NvInfer.h" + +// There is a conflict between cpplint and clang-format-10. 
+// clang-format off +#define TRT_VERSION_GE(major, minor, patch) \ + ((NV_TENSORRT_MAJOR > major) || (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR > minor) || \ + (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && NV_TENSORRT_PATCH >= patch)) +// clang-format on + +namespace tvm { +namespace runtime { +namespace contrib { + +/*! + * \brief Helper function to convert a vector to TRT Dims. + * \param vec Vector. + * \return TRT Dims. + */ +template <typename T> +inline nvinfer1::Dims VectorToTrtDims(const std::vector<T>& vec) { + nvinfer1::Dims dims; + // Dims(nbDims=0, d[0]=1) is used to represent a scalar in TRT. + dims.d[0] = 1; + dims.nbDims = vec.size(); + for (size_t i = 0; i < vec.size(); ++i) { + dims.d[i] = vec[i]; + } + return dims; +} + +/*! + * \brief Helper function to convert TRT Dims to vector. + * \param dims TRT Dims. + * \return Vector. + */ +inline std::vector<int> TrtDimsToVector(const nvinfer1::Dims& dims) { + return std::vector<int>(dims.d, dims.d + dims.nbDims); +} + +/*! + * \brief Helper function to convert vector to string. + * \param vec Vector. + * \return Vector as a string. + */ +template <typename T> +inline std::string DebugString(const std::vector<T>& vec) { Review comment: @zhiics do we have existing helper functions to achieve the same goal? ########## File path: tests/python/contrib/test_tensorrt.py ########## @@ -0,0 +1,896 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import time +import pytest + +import tvm +import tvm.relay.testing +from tvm import relay +from tvm.relay.op.contrib import tensorrt +from tvm.contrib import graph_runtime + + +def skip_codegen_test(): + """Skip test if TensorRT and CUDA codegen are not present""" + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tvm.get_global_func("relay.ext.tensorrt", True): + print("Skip because TensorRT codegen is not available.") + return True + return False + + +def skip_runtime_test(): + if not tvm.runtime.enabled("cuda") or not tvm.gpu(0).exist: + print("Skip because CUDA is not enabled.") + return True + if not tensorrt.is_tensorrt_runtime_enabled(): + print("Skip because TensorRT runtime is not available.") + return True + return False + + +def run_and_verify(config): + if skip_codegen_test(): + return + f, input_shapes, is_param = config + params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} + input_dict = { + k: np.random.uniform(-1, 1, v).astype(np.float32) + for k, v in input_shapes.items() + if k not in is_param + } + + # Run TRT + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + 
mod.run(**input_dict) + results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + # Run reference + mod = tvm.IRModule() + mod["main"] = f + with tvm.transform.PassContext(opt_level=3): + graph, lib, graph_params = relay.build(mod, "cuda", params=params) + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**graph_params) + mod.run(**input_dict) + ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())] + + assert len(results) == len(ref_results) + for i in range(len(results)): + res = results[i].asnumpy() + ref_res = ref_results[i].asnumpy() + assert res.shape == ref_res.shape + tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3) + + +def run_and_verify_model(model): + if skip_codegen_test(): + return + + def compile_and_run(i_data, input_shape, dtype, use_trt=True, num_iteration=1): + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model + + def check_trt_used(graph): + import json + + graph = json.loads(graph) + num_trt_subgraphs = sum( + [ + 1 + for n in graph["nodes"] + if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_") + ] + ) + assert num_trt_subgraphs >= 1 + + block = get_model(model, pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + + if use_trt: + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + graph, lib, params = relay.build(mod, "cuda", params=params) + check_trt_used(graph) + else: + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "cuda", params=params) + + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + mod.set_input(**params) + # Warmup + for i in range(10): + mod.run(data=i_data) + # Time + times = [] + for i in range(num_iteration): + start_time = time.time() + mod.run(data=i_data) + res = 
mod.get_output(0) + times.append(time.time() - start_time) + latency = 1000.0 * np.mean(times) + print(model, latency) + return res + + dtype = "float32" + input_shape = (1, 3, 224, 224) + i_data = np.random.uniform(-1, 1, input_shape).astype(dtype) + res = compile_and_run(i_data, input_shape, dtype, use_trt=True) + ref_res = compile_and_run(i_data, input_shape, dtype, use_trt=False, num_iteration=1) + tvm.testing.assert_allclose(res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-3) + + +def test_tensorrt_simple(): + if skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 3, 2, 2) + yshape = (1, 3, 1, 1) + zshape = (1, 1, 1, 1) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.var("y", shape=(yshape), dtype=dtype) + z = relay.var("z", shape=(zshape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + y_data = np.random.uniform(-1, 1, yshape).astype(dtype) + z_data = np.random.uniform(-1, 1, zshape).astype(dtype) + mod.run(x=x_data, y=y_data, z=z_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_not_compatible(): + if skip_codegen_test(): + return + dtype = "float32" + xshape = (1, 32, 14, 14) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.add(x, x) + z = relay.erf(y) + out = relay.nn.relu(z) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + graph, lib, params = 
relay.build(mod, "cuda") + if skip_runtime_test(): + return + mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0)) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + mod.run(x=x_data) + results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())] + + +def test_tensorrt_serialize(): + if skip_codegen_test(): + return + import mxnet + from mxnet.gluon.model_zoo.vision import get_model + + block = get_model("resnet18_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet( + block, shape={"data": (1, 3, 224, 224)}, dtype="float32" + ) + # Compile + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): + lib = relay.build(mod, "cuda", params=params) + # Serialize + lib.export_library("compiled.so") + # Deserialize + loaded_lib = tvm.runtime.load_module("compiled.so") + # Run + if skip_runtime_test(): + return + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0))) + i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32") + for i in range(10): + gen_module.run(data=i_data) Review comment: Why you need to run 10 times? ########## File path: src/runtime/contrib/tensorrt/tensorrt_runtime.cc ########## @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/tensorrt/tensorrt_runtime.cc + * \brief JSON runtime implementation for TensorRT. + */ + +#include <dmlc/parameter.h> +#include <tvm/runtime/ndarray.h> +#include <tvm/runtime/registry.h> + +#include <fstream> + +#include "../../file_util.h" +#include "../json/json_node.h" +#include "../json/json_runtime.h" + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#include "tensorrt_builder.h" +#endif + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; + +class TensorRTRuntime : public JSONRuntimeBase { + public: + /*! + * \brief The TensorRT runtime module. Deserialize the provided functions + * on creation and store in the layer cache. + * + * \param symbol_name The name of the function. + * \param graph_json serialized JSON representation of a sub-graph. + * \param const_names The names of each constant in the sub-graph. + */ + explicit TensorRTRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array<String>& const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names), + use_implicit_batch_(true), + max_workspace_size_(size_t(1) << 30) {} + + /*! + * \brief The type key of the module. + * + * \return module type key. + */ + const char* type_key() const override { return "tensorrt"; } + + /*! + * \brief Initialize runtime. Create TensorRT layer from JSON + * representation. + * + * \param consts The constant params from compiled model. 
+ */ + void Init(const Array<NDArray>& consts) override { + CHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number of required."; + LoadGlobalAttributes(); + if (GetCachedEnginesFromDisk()) return; + SetupConstants(consts); + BuildEngine(); + CacheEngineToDisk(); + } + + void LoadGlobalAttributes() { + // These settings are global to the entire subgraph. Codegen will add them as attributes to all + // op nodes. Read from first one. + for (size_t i = 0; i < nodes_.size(); ++i) { + if (nodes_[i].HasAttr("use_implicit_batch") && nodes_[i].HasAttr("max_workspace_size")) { + use_implicit_batch_ = + std::stoi(nodes_[i].GetAttr<std::vector<std::string>>("use_implicit_batch")[0]); + // Allow max_workspace_size to be overridden at runtime. + size_t runtime_max_workspace_size = + dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(0)); + if (runtime_max_workspace_size != 0) { + max_workspace_size_ = runtime_max_workspace_size; + } else { + max_workspace_size_ = + std::stoul(nodes_[i].GetAttr<std::vector<std::string>>("max_workspace_size")[0]); + } + return; + } + } + } + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT + /*! \brief Run inference using built engine. */ + void Run() override { + auto& engine_and_context = trt_engine_cache_.at(symbol_name_); + auto engine = engine_and_context.engine; + auto context = engine_and_context.context; + std::vector<void*> bindings(engine->getNbBindings(), nullptr); + + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + if (nodes_[nid].GetOpType() == "input") { Review comment: Could you remind me what other possible types here? ########## File path: src/runtime/contrib/tensorrt/tensorrt_runtime.cc ########## @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/contrib/tensorrt/tensorrt_runtime.cc + * \brief JSON runtime implementation for TensorRT. + */ + +#include <dmlc/parameter.h> +#include <tvm/runtime/ndarray.h> +#include <tvm/runtime/registry.h> + +#include <fstream> + +#include "../../file_util.h" +#include "../json/json_node.h" +#include "../json/json_runtime.h" + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT +#include "NvInfer.h" +#include "tensorrt_builder.h" +#endif + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime::json; + +class TensorRTRuntime : public JSONRuntimeBase { + public: + /*! + * \brief The TensorRT runtime module. Deserialize the provided functions + * on creation and store in the layer cache. + * + * \param symbol_name The name of the function. + * \param graph_json serialized JSON representation of a sub-graph. + * \param const_names The names of each constant in the sub-graph. + */ + explicit TensorRTRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array<String>& const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names), use_implicit_batch_(true), + max_workspace_size_(size_t(1) << 30) {} + + /*! + * \brief The type key of the module. + * + * \return module type key. + */ + const char* type_key() const override { return "tensorrt"; } + + /*! + * \brief Initialize runtime. 
Create TensorRT layer from JSON + * representation. + * + * \param consts The constant params from compiled model. + */ + void Init(const Array<NDArray>& consts) override { + CHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number of required."; + LoadGlobalAttributes(); + if (GetCachedEnginesFromDisk()) return; + SetupConstants(consts); + BuildEngine(); + CacheEngineToDisk(); + } + + void LoadGlobalAttributes() { + // These settings are global to the entire subgraph. Codegen will add them as attributes to all + // op nodes. Read from first one. + for (size_t i = 0; i < nodes_.size(); ++i) { + if (nodes_[i].HasAttr("use_implicit_batch") && nodes_[i].HasAttr("max_workspace_size")) { + use_implicit_batch_ = + std::stoi(nodes_[i].GetAttr<std::vector<std::string>>("use_implicit_batch")[0]); + // Allow max_workspace_size to be overridden at runtime. + size_t runtime_max_workspace_size = + dmlc::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(0)); + if (runtime_max_workspace_size != 0) { + max_workspace_size_ = runtime_max_workspace_size; + } else { + max_workspace_size_ = + std::stoul(nodes_[i].GetAttr<std::vector<std::string>>("max_workspace_size")[0]); + } + return; + } + } + } + +#ifdef TVM_GRAPH_RUNTIME_TENSORRT + /*! \brief Run inference using built engine. 
*/ + void Run() override { + auto& engine_and_context = trt_engine_cache_.at(symbol_name_); + auto engine = engine_and_context.engine; + auto context = engine_and_context.context; + std::vector<void*> bindings(engine->getNbBindings(), nullptr); + + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + if (nodes_[nid].GetOpType() == "input") { + for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) { + uint32_t eid = EntryID(nid, j); + const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); + int binding_index = engine->getBindingIndex(name.c_str()); + CHECK_NE(binding_index, -1); + bindings[binding_index] = data_entry_[eid]->data; + } + } + } + + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + const std::string& name = engine_and_context.outputs[i]; + int binding_index = engine->getBindingIndex(name.c_str()); + CHECK_NE(binding_index, -1); + bindings[binding_index] = data_entry_[eid]->data; + } + +#if TRT_VERSION_GE(6, 0, 1) + if (use_implicit_batch_) { + CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; + } else { + CHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; + } +#else + CHECK(context->execute(batch_size_, bindings.data())) << "Running TensorRT failed."; +#endif + } + + private: + /*! + * \brief Build TensorRT engine from JSON representation. + */ + void BuildEngine() { + LOG(INFO) << "Building new TensorRT engine for subgraph " << symbol_name_; + const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); + batch_size_ = GetBatchSize(); + TensorRTBuilder builder(&logger_, max_workspace_size_, use_implicit_batch_, use_fp16, + batch_size_); + + // Add inputs and constants. 
+ for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + const auto& node = nodes_[nid]; + std::string name = node.GetOpName(); + if (node.GetOpType() == "input") { + builder.AddInput(nid, node); + } else { + CHECK_EQ(node.GetOpType(), "const"); + uint32_t eid = EntryID(nid, 0); + builder.AddConstant(nid, data_entry_[eid]); + } + } + + // Add layers. + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() != "kernel") continue; + builder.AddLayer(nid, node); + } + + // Add outputs. + for (size_t i = 0; i < outputs_.size(); ++i) { + builder.AddOutput(outputs_[i]); + } + + // Build engine. + trt_engine_cache_[symbol_name_] = builder.BuildEngine(); + LOG(INFO) << "Finished building TensorRT engine for subgraph " << symbol_name_; + } + + /*! \brief If TVM_TENSORRT_CACHE_DIR is set, will check that directory for + * already built TRT engines and load into trt_engine_cache_ so they don't + * have to be built at first inference. + */ + bool GetCachedEnginesFromDisk() { Review comment: This is an interesting discussion. I realized that this is more like a serialization for platform-dependent TensorRT engines. If it's not possible to build and serialize the engine during the compilation (or cross-compilation) even we have built the TVM with TensorRT runtime, then this is probably inevitable; otherwise we may build the engine and serialize the bit-stream along with other artifacts in `SaveToBinary`. If the serialization here is inevitable, which I believe in it because users may not have TensorRT during compilation, then the next question is whether we can update the ".so" file with the serialized engine here instead of creating a separate file. In other words, the .so file may or may not contain a serialized engine, but if it has, we don't need to build it again. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
