areusch commented on a change in pull request #7785:
URL: https://github.com/apache/tvm/pull/7785#discussion_r614972206



##########
File path: include/tvm/runtime/crt/page_allocator.h
##########
@@ -18,12 +18,12 @@
  */
 
 /*!
- * \file tvm/runtime/crt/memory.h
+ * \file tvm/runtime/crt/page_allocator.h

Review comment:
       Do you also want to rename `MemoryManagerCreate` to e.g. `PageMemoryManagerCreate`,
since we now have `StackMemoryManager`?

##########
File path: python/tvm/relay/backend/executor_factory.py
##########
@@ -14,21 +14,125 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Graph executor factory."""
+"""Executor factory modules."""
+from abc import abstractmethod
 import warnings
+
+from tvm import tir
+
 from ..._ffi.base import string_types
 from ..._ffi.registry import get_global_func
 from ...runtime import ndarray
 
 
-class GraphExecutorFactoryModule:
+class ExecutorFactoryModule:
+    """Common interface for executor factory modules
+    This class describes the common API of different
+    factory modules
+    """
+
+    @abstractmethod
+    def get_internal_repr(self):
+        """Common function to return the internal representation
+        the executor relies upon to execute the network
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_params(self):
+        """
+        Sometimes we want to get params explicitly.
+        For example, we want to save its params value to
+        an independent file.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_lib(self):
+        """ Return the generated library"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_internal_repr(self):
+        """ Return the internal representation used to execute the network"""
+        raise NotImplementedError
+
+    def __getitem__(self, item):
+        print(item)
+        return self.module.__getitem__(item)
+
+    def __iter__(self):
+        warnings.warn(
+            "legacy graph executor behavior of producing json / lib / params 
will be "
+            "removed in the next release."
+            " Please see documents of tvm.contrib.graph_executor.GraphModule 
for the "
+            " new recommended usage.",
+            DeprecationWarning,
+            2,
+        )
+        return self
+
+    def __next__(self):
+        if self.iter_cnt > 2:
+            raise StopIteration
+
+        objs = [self.get_internal_repr(), self.lib, self.params]
+        obj = objs[self.iter_cnt]
+        self.iter_cnt += 1
+        return obj
+
+
+class AOTExecutorFactoryModule(ExecutorFactoryModule):
+    """AOT executor factory module.

Review comment:
       I think this docstring belongs on `__init__`; otherwise, change the "Parameters" section to "Attributes".
See https://numpydoc.readthedocs.io/en/latest/format.html#class-docstring

##########
File path: python/tvm/relay/backend/executor_factory.py
##########
@@ -14,21 +14,125 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Graph executor factory."""
+"""Executor factory modules."""
+from abc import abstractmethod
 import warnings
+
+from tvm import tir
+
 from ..._ffi.base import string_types
 from ..._ffi.registry import get_global_func
 from ...runtime import ndarray
 
 
-class GraphExecutorFactoryModule:
+class ExecutorFactoryModule:
+    """Common interface for executor factory modules
+    This class describes the common API of different
+    factory modules
+    """
+
+    @abstractmethod
+    def get_internal_repr(self):
+        """Common function to return the internal representation

Review comment:
       @giuseros, could you follow the numpydoc docstring style here? See:
https://numpydoc.readthedocs.io/en/latest/format.html#sections

##########
File path: src/target/source/codegen_c_host.cc
##########
@@ -211,21 +214,34 @@ void CodeGenCHost::PrintGetFuncFromBackend(const 
std::string& func_name,
   this->stream << "}\n";
 }
 
-void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, int 
num_args) {
+void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, PrimExpr 
values,
+                                 int num_args) {
   this->PrintIndent();
+  std::string stack_value = "stack_value";

Review comment:
       Does it make sense to name this in a way that explicitly indicates it is a
default, e.g. `unnamed_stack_value`?

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/graph_codegen.cc
+ * \brief Graph runtime codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
+/*! \brief Lowered outputs */
+struct AOTLoweredOutput {
+  tir::PrimFunc runner_func;
+  Map<String, IRModule> lowered_funcs;
+  Array<tvm::runtime::Module> external_mods;
+  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> 
params;
+  runtime::AOTMetadata aot_metadata;
+};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variable to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
+  /*!
+   * \brief Utility function to allocate memory for storage identifiers
+   * \param  memory_size_byte size in bytes of the allocation
+   * \return PrimExpr representing the allocated memory
+   */
+  PrimExpr AllocateBackendMemory(int memory_size_byte) {
+    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
+    // to enable unified memory planning
+    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
+    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 
8});
+  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the 
given Relay Expr
+   */
+  std::vector<tir::Var> pack_sid(Expr expr) {
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    std::vector<tir::Var> sid_vars;
+
+    // Note that an expression can have multiple sids associated with it
+    // e.g., returning multiple values from a function
+    for (const auto& sid : sids[0]) {
+      // Determine if an sid is an output buffer
+      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), 
sid_int);
+      if (output_iter != return_sid_.end()) {
+        int output_index = std::distance(return_sid_.begin(), output_iter);
+        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
+        continue;
+      }
+      // Pack the sid inside the TVMValue
+      auto sid_array = te::Var(make_string("sid_", sid, "_value"), 
DataType::Handle());
+      auto sid_value = sids_table_[sid];
+      tvm::PrimExpr set_tensor =
+          tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_set(),
+                         {sid_array, 0, tir::builtin::kArrData, sid_value});
+      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), 
tir::Evaluate(set_tensor)));
+      sid_vars.push_back(sid_array);
+    }
+    return sid_vars;
+  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression assicated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[params_by_expr_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (params_by_expr_.find(arg) != params_by_expr_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
+  /*!
+   * brief Copy a variable to the output. This function is mainly used in edge 
cases
+   * when we want to return an input or a parameter.
+   */
+  void copy_to_output(te::Var out, te::Var in, size_t size) {
+    auto retval_get = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                     {in, 0, tir::builtin::kArrData});
+
+    // Define intermediate DLTensor to load/store the data
+    auto tmp0 = te::Var("tmp0", DataType::Handle());
+    auto tmp1 = te::Var("tmp1", DataType::Handle());
+    te::Var loop_idx("i", DataType::Int(32));
+    auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, 
tir::const_true());
+    auto tostore = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                  {out, 0, tir::builtin::kArrData});
+
+    // Copy the variable from the input to the output
+    tir::Stmt copy = tir::For(
+        loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial,
+        tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, 
tir::const_true()));
+    stmts_.push_back(tir::LetStmt(tmp1, tostore, copy));
+  }
+
+  /*!
+   * Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string make_string(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  void VisitExpr_(const CallNode* op) override {
+    // Descend the call tree
+    for (auto arg : op->args) {
+      VisitExpr(arg);
+    }
+
+    Expr expr = GetRef<Expr>(op);
+    Function func;
+    if (op->op.as<OpNode>()) {
+      LOG(FATAL) << "Operators should be transformed away; try applying"
+                 << "the fuse_ops transformation to the expression.";
+    } else if (op->op.as<GlobalVarNode>()) {
+      LOG(FATAL) << "Not implemented";
+    } else if (op->op.as<FunctionNode>()) {
+      func = GetRef<Function>(op->op.as<FunctionNode>());
+    } else {
+      LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->GetTypeKey();
+    }
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      LOG(FATAL) << "TVM only support calls to primitive functions "
+                 << "(i.e functions composed of fusable operator invocations)";
+    }
+
+    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
+    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
+    Target target;
+    // Handle external function
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = (*pf0)(func, target);
+      CachedFunc ext_func = (*pf1)(compile_engine_, key);
+      ICHECK(ext_func.defined()) << "External function is not defined.";
+      UpdateConstants(func, &params_);
+
+      // Generate the TIR function call
+      func_call(GetRef<Call>(op), ext_func->func_name);
+    }
+
+    ICHECK_GE(storage_device_map_.count(expr), 0);
+    auto& device_type = storage_device_map_[expr][1];
+    auto call_dev_type = device_type[0]->value;
+    // Normal Relay Function
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(call_dev_type);
+      }
+      if (targets_.count(call_dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      target = targets_[call_dev_type];
+    }
+    CCacheKey key = (*pf0)(func, target);
+    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
+    if (!lowered_funcs_.count(target->str())) {
+      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+    }
+    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Generate the TIR function call
+    func_call(GetRef<Call>(op), lowered_func->func_name);
+  }
+
+  void VisitExpr_(const VarNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+
+    // If the Var node is an output node we need to copy the content of the 
variable to the output
+    // It's safe to check the SID here because Var StorageToken are never 
reallocated
+    Array<IntegerArray> sids = storage_device_map_[expr];
+
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto var_expr = find_expr(expr);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
var_expr[0], sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    size_t index = params_.size();
+    std::string name = "p" + std::to_string(index);
+
+    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    params_[name] = op->data;
+    params_by_expr_.Set(expr, name);
+
+    // If the Constant node is an output node we need to copy the content of 
the parameter to the
+    // output A Var node can only produce a single output
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
pack_param(expr),
+                     sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    // TODO(giuseros): support Let nodes in AOT
+    CHECK(false) << "Let not yet implemented in AOT";
+  }
+  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); 
}
+  void VisitExpr_(const OpNode* op) override {
+    throw std::runtime_error("can not compile op in non-eta expanded form");
+  }
+  void VisitExpr_(const GlobalVarNode* op) override { throw 
std::runtime_error(""); }
+  void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if 
not supported"); }
+  void VisitExpr_(const FunctionNode* op) override {
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
+        << "Only functions supported by custom codegen";
+  }
+  void VisitExpr_(const RefCreateNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefReadNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefWriteNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const ConstructorNode* op) override {
+    throw std::invalid_argument("ADT constructor case not yet implemented");
+  }
+  void VisitExpr_(const MatchNode* op) override {
+    throw std::invalid_argument("match case not yet implemented");
+  }
+
+  // Create the main PrimFunc to execute the graph
+  tir::PrimFunc CreateMainFunc(unsigned int relay_params) {
+    tir::Stmt body = tir::SeqStmt(stmts_);
+
+    // Allocate the sids
+    std::unordered_map<int, bool> allocated;
+
+    for (auto kv : storage_device_map_) {
+      // Only allocate sids that are needed
+      const bool is_input =
+          (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != 
input_vars_.end());
+      const bool is_param = (params_by_expr_.find(kv.first) != 
params_by_expr_.end());
+      if (is_input || is_param) {
+        continue;
+      }
+
+      for (unsigned int i = 0; i < kv.second[0].size(); i++) {
+        int size = kv.second[2][i];
+        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
+
+        if (std::find(return_sid_.begin(), return_sid_.end(), sid) != 
return_sid_.end()) {
+          continue;
+        }
+
+        // TODO(giuseros): we should allocate this one time outside the 
PrimFunc
+        // so we dont' pay the price of allocation for every inference
+        if (!allocated[sid]) {
+          body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), 
body);
+        }
+        allocated[sid] = true;
+      }
+    }
+
+    // Define the attributes
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body);
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body);
+
+    // Make the PrimFunc
+    return tir::PrimFunc(main_signature_, body, VoidType(), Map<tir::Var, 
tir::Buffer>(),
+                         DictAttrs(dict_attrs_));
+  }
+
+ protected:
+  /*! \brief mod */
+  runtime::Module* mod_;
+  /*! \brief list of input expressions (i.e., variable passed by the user) */
+  std::vector<Expr> input_vars_;
+  /*! \brief input and output variables belonging to the main function 
signature */
+  Array<tir::Var> main_signature_;
+  /*! \brief target device */
+  TargetsMap targets_;
+  /*! \brief target host */
+  Target target_host_;
+  /*! PrimFunc attributes */
+  Map<String, ObjectRef> dict_attrs_;
+
+  /*!
+   * \brief parameters (i.e. ConstantNodes found in the graph).
+   * These are take as inputs to the GraphRuntime.
+   * Maps param name to a pair of storage_id and NDArray. At runtime, the 
storage_id can be
+   * used to lookup the parameter.
+   */
+  std::unordered_map<std::string, runtime::NDArray> params_;
+  /*! \brief mapping between expression and parameters */
+  Map<Expr, String> params_by_expr_;
+  /*! \brief mapping between parameter names ("p0", "p1", etc..) and storage 
identifiers*/
+  std::unordered_map<std::string, int64_t> param_storage_ids_;
+
+  /*! \brief plan memory of device result */
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  std::unordered_map<int, te::Var> sids_table_;
+  /*! \brief lowered funcs */
+  std::unordered_map<std::string, IRModule> lowered_funcs_;
+  /*! \brief name map */
+  std::unordered_map<std::string, size_t> name_map_;
+  /*! \brief compile engine */
+  CompileEngine compile_engine_;
+  /*! \brief GraphPlanMemory module */
+  runtime::Module graph_plan_memory_module_;
+  /*! \brief the IR module stored which represents the executor program */
+  Map<String, IRModule> tir_module_;
+  /*! \brief the set of statements that make the program */
+  std::vector<tir::Stmt> stmts_;
+  /*! \brief the list of return sids (note that the function might return more 
then one output */
+  IntegerArray return_sid_;
+
+ public:
+  AOTCodegen(runtime::Module* mod, const TargetsMap& targets, Target 
target_host)
+      : mod_(mod), return_sid_() {
+    compile_engine_ = CompileEngine::Global();
+    targets_ = targets;
+    target_host_ = target_host;
+    dict_attrs_.Set("global_symbol", runtime::String("tvm__run_func"));

Review comment:
       Does this need to be class-level?

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/aot_codegen.cc
+ * \brief AOT executor codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
// Storage/device/size annotations are carried as arrays of integers.
using IntegerArray = Array<Integer>;
// One shape (vector of dims) per output tensor.
using ShapeVector = std::vector<std::vector<int64_t>>;
// Free-form per-node attributes, keyed by name.
using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
// Maps a device type (int) to the Target used to compile for it.
using TargetsMap = std::unordered_map<int, Target>;

/*! \brief Lowered outputs */
struct AOTLoweredOutput {
  // TIR function that drives the whole network execution.
  tir::PrimFunc runner_func;
  // Lowered operator functions, grouped per target string.
  Map<String, IRModule> lowered_funcs;
  // Modules produced by external (BYOC) codegens.
  Array<tvm::runtime::Module> external_mods;
  // Param name -> (storage id, constant value).
  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> params;
  // Input/output counts consumed by the generated model descriptor.
  runtime::AOTMetadata aot_metadata;
};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variable to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
+  /*!
+   * \brief Utility function to allocate memory for storage identifiers
+   * \param  memory_size_byte size in bytes of the allocation
+   * \return PrimExpr representing the allocated memory
+   */
+  PrimExpr AllocateBackendMemory(int memory_size_byte) {
+    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
+    // to enable unified memory planning
+    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
+    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 
8});
+  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the 
given Relay Expr
+   */
+  std::vector<tir::Var> pack_sid(Expr expr) {
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    std::vector<tir::Var> sid_vars;
+
+    // Note that an expression can have multiple sids associated with it
+    // e.g., returning multiple values from a function
+    for (const auto& sid : sids[0]) {
+      // Determine if an sid is an output buffer
+      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), 
sid_int);
+      if (output_iter != return_sid_.end()) {
+        int output_index = std::distance(return_sid_.begin(), output_iter);
+        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
+        continue;
+      }
+      // Pack the sid inside the TVMValue
+      auto sid_array = te::Var(make_string("sid_", sid, "_value"), 
DataType::Handle());
+      auto sid_value = sids_table_[sid];
+      tvm::PrimExpr set_tensor =
+          tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_set(),
+                         {sid_array, 0, tir::builtin::kArrData, sid_value});
+      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), 
tir::Evaluate(set_tensor)));
+      sid_vars.push_back(sid_array);
+    }
+    return sid_vars;
+  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression assicated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[params_by_expr_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (params_by_expr_.find(arg) != params_by_expr_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
  /*!
   * \brief Copy a variable to the output. This function is mainly used in edge cases
   * when we want to return an input or a parameter.
   * \param out variable holding the destination DLTensor
   * \param in variable holding the source DLTensor
   * \param size number of bytes to copy
   */
  void copy_to_output(te::Var out, te::Var in, size_t size) {
    // Pointer to the source tensor's data (DLTensor::data field).
    auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                     {in, 0, tir::builtin::kArrData});

    // Define intermediate DLTensor to load/store the data
    auto tmp0 = te::Var("tmp0", DataType::Handle());
    auto tmp1 = te::Var("tmp1", DataType::Handle());
    te::Var loop_idx("i", DataType::Int(32));
    // Byte-wise load from the source buffer at the loop index.
    auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true());
    // Pointer to the destination tensor's data (DLTensor::data field).
    auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                  {out, 0, tir::builtin::kArrData});

    // Copy the variable from the input to the output, one byte per iteration;
    // the Let binds tmp0 to the source pointer for the duration of the load.
    tir::Stmt copy = tir::For(
        loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial,
        tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true()));
    stmts_.push_back(tir::LetStmt(tmp1, tostore, copy));
  }
+
+  /*!
+   * Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string make_string(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  void VisitExpr_(const CallNode* op) override {
+    // Descend the call tree
+    for (auto arg : op->args) {
+      VisitExpr(arg);
+    }
+
+    Expr expr = GetRef<Expr>(op);
+    Function func;
+    if (op->op.as<OpNode>()) {
+      LOG(FATAL) << "Operators should be transformed away; try applying"
+                 << "the fuse_ops transformation to the expression.";
+    } else if (op->op.as<GlobalVarNode>()) {
+      LOG(FATAL) << "Not implemented";
+    } else if (op->op.as<FunctionNode>()) {
+      func = GetRef<Function>(op->op.as<FunctionNode>());
+    } else {
+      LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->GetTypeKey();
+    }
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      LOG(FATAL) << "TVM only support calls to primitive functions "
+                 << "(i.e functions composed of fusable operator invocations)";
+    }
+
+    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
+    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
+    Target target;
+    // Handle external function
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = (*pf0)(func, target);
+      CachedFunc ext_func = (*pf1)(compile_engine_, key);
+      ICHECK(ext_func.defined()) << "External function is not defined.";
+      UpdateConstants(func, &params_);
+
+      // Generate the TIR function call
+      func_call(GetRef<Call>(op), ext_func->func_name);
+    }
+
+    ICHECK_GE(storage_device_map_.count(expr), 0);
+    auto& device_type = storage_device_map_[expr][1];
+    auto call_dev_type = device_type[0]->value;
+    // Normal Relay Function
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(call_dev_type);
+      }
+      if (targets_.count(call_dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      target = targets_[call_dev_type];
+    }
+    CCacheKey key = (*pf0)(func, target);
+    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
+    if (!lowered_funcs_.count(target->str())) {
+      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+    }
+    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Generate the TIR function call
+    func_call(GetRef<Call>(op), lowered_func->func_name);
+  }
+
+  void VisitExpr_(const VarNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+
+    // If the Var node is an output node we need to copy the content of the 
variable to the output
+    // It's safe to check the SID here because Var StorageToken are never 
reallocated
+    Array<IntegerArray> sids = storage_device_map_[expr];
+
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto var_expr = find_expr(expr);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
var_expr[0], sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    size_t index = params_.size();
+    std::string name = "p" + std::to_string(index);
+
+    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    params_[name] = op->data;
+    params_by_expr_.Set(expr, name);
+
+    // If the Constant node is an output node we need to copy the content of 
the parameter to the
+    // output A Var node can only produce a single output
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
pack_param(expr),
+                     sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    // TODO(giuseros): support Let nodes in AOT
+    CHECK(false) << "Let not yet implemented in AOT";
+  }
+  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); 
}
+  void VisitExpr_(const OpNode* op) override {
+    throw std::runtime_error("can not compile op in non-eta expanded form");
+  }
+  void VisitExpr_(const GlobalVarNode* op) override { throw 
std::runtime_error(""); }
+  void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if 
not supported"); }
+  void VisitExpr_(const FunctionNode* op) override {
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
+        << "Only functions supported by custom codegen";

Review comment:
       nit: "FunctionNode only" or just state not supported by AOT

##########
File path: src/target/source/source_module.cc
##########
@@ -191,17 +192,36 @@ class CSourceCrtMetadataModuleNode : public 
runtime::ModuleNode {
           << "}\n";
   }
 
+  void GenerateAOTDescriptor() {
+    code_ << "#include \"aot_executor.h\"\n";
+    code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n";
+    code_ << "#ifdef __cplusplus\n";
+    code_ << "extern \"C\"\n";
+    code_ << "#endif\n";
+    code_ << "TVM_DLL int32_t " << ::tvm::runtime::symbol::tvm_run_func_prefix;
+    code_ << "(void* args, void* type_code, int num_args, void* out_value, 
void* "
+             "out_type_code, void* resource_handle);\n";
+    code_ << "const tvm_model_t network = {\n"
+          << "    .run_func = &" << 
::tvm::runtime::symbol::tvm_run_func_prefix << ",\n"
+          << "    .num_input_tensors = " << aot_metadata_->num_inputs << ",\n"
+          << "    .num_output_tensors = " << aot_metadata_->num_outputs << ", 
\n"
+          << "};\n";
+  }
+
   void CreateSource() {
     if (target_->GetAttr<Bool>("system-lib").value_or(Bool(false)) && 
!func_names_.empty()) {
       CreateFuncRegistry();
       GenerateCrtSystemLib();
     }
+    if (target_->GetAttr<String>("executor").value_or("graph_runtime") == 
"aot") {

Review comment:
       should just be "graph" or "aot". could you use constants here?

##########
File path: tests/python/relay/aot/infra.py
##########
@@ -0,0 +1,226 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This module provides infrastructure for AOT test cases: helpers to
+generate a standalone C test harness (a main file plus data headers for
+inputs and expected outputs), compile it together with the AOT-generated
+model sources, run the resulting binary and check its output.
+"""
+import tflite
+import os
+import io
+import struct
+import numpy as np
+import pathlib
+import shutil
+import subprocess
+import tempfile
+import tarfile
+
+
+import tvm
+from tvm import relay
+from tvm.relay import transform
+from tvm.relay.op.contrib import get_pattern_table
+from tvm.contrib import utils, graph_executor
+from tvm.relay.backend import compile_engine
+from tvm.contrib import utils
+from tvm.contrib import graph_runtime
+from tvm.micro import export_model_library_format
+
+
def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout):
    """Run a shell command, teeing its combined stdout/stderr to a log file
    (and optionally to stdout), and return the process exit code.
    """
    proc = subprocess.Popen(
        cmd, cwd=cwd, shell=True, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    )
    with proc, open(logfile, "a") as log:
        while True:
            line = proc.stdout.readline()
            status = proc.poll()
            # The process is done once there is no more output and poll()
            # reports a valid exit status.
            if line == b"" and status is not None:
                return int(status)
            if line:
                decoded = line.decode("ascii", errors="backslashreplace")
                log.write(decoded)
                if stdout:
                    print(decoded, end="")
+
+
def create_main(test_name, input_list, output_list, output_path):
    """Generate a standalone C ``main`` that runs the AOT-compiled network on
    the given inputs and compares its outputs against the expected data.

    Parameters
    ----------
    test_name : str
        Base name of the generated ``<test_name>.c`` file.
    input_list, output_list : list
        Only their lengths are used here, to emit one include / buffer entry
        per tensor (the data itself lives in the generated headers).
    output_path : str
        Directory in which the C file is created.
    """
    file_path = pathlib.Path(f"{output_path}/" + test_name).resolve()
    # create the C source file
    raw_path = file_path.with_suffix(".c").resolve()
    with open(raw_path, "w") as main_file:
        main_file.write("#include <stdio.h>\n")
        main_file.write('#include "aot_executor.h"\n')
        main_file.write('#include "stack_allocator.h"\n')
        main_file.write("#define WORKSPACE_SIZE (16384*1024)\n")
        main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n")

        for i in range(0, len(input_list)):
            main_file.write('#include "input_data%i.h"\n' % i)
        for i in range(0, len(output_list)):
            main_file.write('#include "expected_output_data%i.h"\n' % i)
            main_file.write('#include "output_data%i.h"\n' % i)

        main_file.write("extern tvm_model_t network;\n")
        main_file.write("tvm_workspace_t app_workspace;\n")
        # Fix: both platform hooks are declared tvm_crt_error_t but previously
        # returned nothing, which is undefined behavior in C.
        main_file.write(
            """
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) {
    (*out_ptr) = StackMemoryManager_Allocate(&app_workspace, num_bytes);
    return kTvmErrorNoError;
}

tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLDevice dev) {
    StackMemoryManager_Free(&app_workspace,ptr);
    return kTvmErrorNoError;
}

void  TVMPlatformAbort(tvm_crt_error_t code) { }

void TVMLogf(const char* msg, ...) { }
"""
        )
        main_file.write("int main(){\n")
        main_file.write("void* inputs[%i] = { " % (len(input_list)))
        for i in range(0, len(input_list)):
            main_file.write("input_data%i, " % i)
        main_file.write("};\n")

        main_file.write("void* outputs[%i]  = { " % (len(output_list)))
        for i in range(0, len(output_list)):
            main_file.write("output_data%i, " % i)
        main_file.write("};\n")

        main_file.write("StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);")
        main_file.write("tvm_runtime_run(&network, inputs, outputs);")

        # Compare every produced output element against the expected data.
        for i in range(0, len(output_list)):
            main_file.write("for (int i = 0; i<output_data%i_len; i++){\n" % i)
            main_file.write(
                'if (output_data%s[i]!=expected_output_data%s[i]){printf("ko\\n");return -1;}\n'
                % (i, i)
            )
            main_file.write("}\n")

        main_file.write('printf("ok\\n");')
        main_file.write("return 0;")
        main_file.write("}\n")
+
+
def create_header_file(tensor_name, npy_data, output_path):
    """
    This method generates a header file containing the data contained in the numpy array
    provided. It is used to capture the tensor data (for both inputs and expected outputs)
    to be bundled into the standalone test runner.

    Parameters
    ----------
    tensor_name : str
        Name of the C array (and of the generated ``<tensor_name>.h`` file).
    npy_data : numpy.ndarray
        Data to embed; dtype must be int8, int32, uint8 or float32.
    output_path : str
        Directory in which the header is created.

    Raises
    ------
    ValueError
        If ``npy_data`` has an unsupported dtype.
    """
    # Map the supported numpy dtypes onto their C declarations.
    dtype_to_ctype = {
        "int8": "int8_t",
        "int32": "int32_t",
        "uint8": "uint8_t",
        "float32": "float",
    }
    dtype_name = str(npy_data.dtype)
    if dtype_name not in dtype_to_ctype:
        # Fix: an unsupported dtype previously produced an invalid header
        # (an initializer with no declaration); fail loudly instead.
        raise ValueError(f"Unsupported dtype {dtype_name} for tensor {tensor_name}")

    file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve()
    # create header file
    raw_path = file_path.with_suffix(".h").resolve()
    with open(raw_path, "w") as header_file:
        header_file.write("#include <stddef.h>\n")
        header_file.write("#include <stdint.h>\n")
        header_file.write("#include <dlpack/dlpack.h>\n")
        header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n")
        header_file.write(f"{dtype_to_ctype[dtype_name]} {tensor_name}[] = ")

        header_file.write("{")
        for i in np.ndindex(npy_data.shape):
            header_file.write(f"{npy_data[i]}, ")
        header_file.write("};\n\n")
+
+
+def verify_source(mod, input_list, output_list, params=None):

Review comment:
       I think you could call this something like compile_and_run, 
verify_source sounds like it's just going to assert on the source code content

##########
File path: python/tvm/relay/backend/executor_factory.py
##########
@@ -14,21 +14,125 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Graph executor factory."""
+"""Executor factory modules."""
+from abc import abstractmethod
 import warnings
+
+from tvm import tir
+
 from ..._ffi.base import string_types
 from ..._ffi.registry import get_global_func
 from ...runtime import ndarray
 
 
-class GraphExecutorFactoryModule:
class ExecutorFactoryModule:
    """Common interface for executor factory modules.

    This class describes the common API shared by the different factory
    modules (e.g. graph executor, AOT executor) produced by relay.build.
    """

    @abstractmethod
    def get_internal_repr(self):
        """Return the internal representation the executor
        relies upon to execute the network.
        """
        raise NotImplementedError

    @abstractmethod
    def get_params(self):
        """Return the compiled parameters of the module.

        Sometimes we want to get params explicitly.
        For example, we want to save its params value to
        an independent file.
        """
        raise NotImplementedError

    @abstractmethod
    def get_lib(self):
        """Return the generated library."""
        raise NotImplementedError

    def __getitem__(self, item):
        # Fix: removed a stray debug print(item); delegate straight to the
        # underlying runtime module.
        return self.module.__getitem__(item)

    def __iter__(self):
        warnings.warn(
            "legacy graph executor behavior of producing json / lib / params will be "
            "removed in the next release."
            " Please see documents of tvm.contrib.graph_executor.GraphModule for the "
            " new recommended usage.",
            DeprecationWarning,
            2,
        )
        return self

    def __next__(self):
        # Legacy 3-tuple iteration: internal repr (e.g. graph json), lib, params.
        if self.iter_cnt > 2:
            raise StopIteration

        objs = [self.get_internal_repr(), self.lib, self.params]
        obj = objs[self.iter_cnt]
        self.iter_cnt += 1
        return obj
+
+
+class AOTExecutorFactoryModule(ExecutorFactoryModule):
+    """AOT executor factory module.
+
+    Parameters
+    ----------
+    runner_function : the PrimFunc containing of the TIR main executor 
function.
+    target : tvm.Target
+        The Target used to build this module.
+    libmod : tvm.Module
+        The module of the corresponding function
+    libmod_name: str
+        The name of module
+    params : dict of str to NDArray
+        The parameters of module
+    """
+
+    def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, 
params):
+        assert isinstance(runner_function, tir.PrimFunc)
+        args = []

Review comment:
       this seems unused here

##########
File path: src/target/source/codegen_c_host.cc
##########
@@ -40,13 +40,16 @@ namespace codegen {
 
 CodeGenCHost::CodeGenCHost() { module_name_ = 
GetUniqueName("__tvm_module_ctx"); }
 
-void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string 
target_str) {
+void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, bool 
is_aot_executor,
+                        std::string target_str) {

Review comment:
       ok @giuseros, I agree this PR is pretty close to what I was proposing as 
a first cut. There are two areas I'd disagree:
   1. the `tir.call_cpacked` intrinsic--I think we are in agreement here, so 
let's see if this works out. if not, perhaps we can rely on the FuncRegistry 
and introduce a workaround in a follow-on PR.
   2. I think given we are deferring discussing the firmware-facing API, we 
should not checkin `include/tvm/runtime/crt/aot_executor.h` and instead place 
this in `src/runtime/crt/include/tvm/runtime/crt/common/`. This way, you can 
use it from tests without us declaring it as a public API just yet. There might 
be one tricky thing here--you might need to tweak your aot_test.mk to place 
those files in the public include path.
   
   how does this sound as far as a scope for this PR?

##########
File path: tests/python/relay/aot/aot_test.mk
##########
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# Makefile to build ethosu_test_runner
+# Setup build environment
+#
+AOT_ROOT ?= $(TVM_ROOT)/src/runtime/crt/aot

Review comment:
       @giuseros just curious if you have thoughts on this?

##########
File path: python/tvm/relay/build_module.py
##########
@@ -111,7 +113,7 @@ def build(self, mod, target=None, target_host=None, 
params=None):
 
         Returns
         -------
-        factory_module : 
tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule
+        factory_module : 
tvm.relay.backend.executor_factory.ExecutorFactoryModule

Review comment:
       could you update the docstring to reflect the tuple return value?

##########
File path: python/tvm/relay/backend/executor_factory.py
##########
@@ -14,21 +14,125 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Graph executor factory."""
+"""Executor factory modules."""
+from abc import abstractmethod
 import warnings
+
+from tvm import tir
+
 from ..._ffi.base import string_types
 from ..._ffi.registry import get_global_func
 from ...runtime import ndarray
 
 
-class GraphExecutorFactoryModule:
+class ExecutorFactoryModule:
+    """Common interface for executor factory modules
+    This class describes the common API of different
+    factory modules
+    """
+
+    @abstractmethod
+    def get_internal_repr(self):
+        """Common function to return the internal representation
+        the executor relies upon to execute the network
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_params(self):
+        """
+        Sometimes we want to get params explicitly.
+        For example, we want to save its params value to
+        an independent file.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_lib(self):
+        """ Return the generated library"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_internal_repr(self):
+        """ Return the internal representation used to execute the network"""
+        raise NotImplementedError
+
+    def __getitem__(self, item):
+        print(item)
+        return self.module.__getitem__(item)
+
+    def __iter__(self):
+        warnings.warn(
+            "legacy graph executor behavior of producing json / lib / params 
will be "
+            "removed in the next release."
+            " Please see documents of tvm.contrib.graph_executor.GraphModule 
for the "
+            " new recommended usage.",
+            DeprecationWarning,
+            2,
+        )
+        return self
+
+    def __next__(self):
+        if self.iter_cnt > 2:
+            raise StopIteration
+
+        objs = [self.get_internal_repr(), self.lib, self.params]
+        obj = objs[self.iter_cnt]
+        self.iter_cnt += 1
+        return obj
+
+
+class AOTExecutorFactoryModule(ExecutorFactoryModule):
+    """AOT executor factory module.
+
+    Parameters
+    ----------
+    runner_function : the PrimFunc containing of the TIR main executor 
function.
+    target : tvm.Target
+        The Target used to build this module.
+    libmod : tvm.Module
+        The module of the corresponding function
+    libmod_name: str
+        The name of module
+    params : dict of str to NDArray
+        The parameters of module
+    """
+
+    def __init__(self, ir_mod, target, runner_function, libmod, libmod_name, 
params):
+        assert isinstance(runner_function, tir.PrimFunc)
+        args = []
+        for k, v in params.items():
+            args.append(k)
+            args.append(ndarray.array(v))
+
+        self.ir_mod = ir_mod
+        self.target = target
+        self.runner_func = runner_function
+        self.lib = libmod
+        self.libmod_name = libmod_name
+        self.params = params
+        self.iter_cnt = 0
+
+    # Sometimes we want to get params explicitly.

Review comment:
       make this a docstring or rm

##########
File path: python/tvm/micro/model_library_format.py
##########
@@ -126,20 +125,25 @@ def export_model_library_format(mod: 
graph_executor_factory.GraphExecutorFactory
 
     Parameters
     ----------
-    mod : tvm.relay.backend.graph_executor_factory.GraphExecutorFactoryModule
+    mod : tvm.relay.backend.executor_factory.ExecutorFactoryModule
         The return value of tvm.relay.build, which will be exported into Model 
Library Format.
     file_name : str
         Path to the .tar archive to generate.
     """
     tempdir = utils.tempdir()
+    is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)

Review comment:
       I think we should either update the docstring/type annotation or remove 
this logic.

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/aot_codegen.cc
+ * \brief AOT executor codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
+/*! \brief Lowered outputs */
+struct AOTLoweredOutput {
+  tir::PrimFunc runner_func;
+  Map<String, IRModule> lowered_funcs;
+  Array<tvm::runtime::Module> external_mods;
+  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> 
params;
+  runtime::AOTMetadata aot_metadata;
+};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variables to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
+  /*!
+   * \brief Utility function to allocate memory for storage identifiers
+   * \param  memory_size_byte size in bytes of the allocation
+   * \return PrimExpr representing the allocated memory
+   */
+  PrimExpr AllocateBackendMemory(int memory_size_byte) {
+    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
+    // to enable unified memory planning
+    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
+    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 
8});
+  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the 
given Relay Expr
+   */
+  std::vector<tir::Var> pack_sid(Expr expr) {

Review comment:
       could you rename these to follow style guide, since they're not 
accessors/mutators? 
https://google.github.io/styleguide/cppguide.html#Function_Names
   
   e.g. PackSid or PackSid_

##########
File path: python/tvm/relay/build_module.py
##########
@@ -287,10 +299,19 @@ def build(ir_mod, target=None, target_host=None, 
params=None, mod_name="default"
 
     with tophub_context:
         bld_mod = BuildModule()
-        graph_json, runtime_mod, params = bld_mod.build(mod=ir_mod, 
target=target, params=params)
-        executor_factory = _graph_executor_factory.GraphExecutorFactoryModule(
-            ir_mod, target, graph_json, runtime_mod, mod_name, params
-        )
+        internal_repr, runtime_mod, params = bld_mod.build(mod=ir_mod, 
target=target, params=params)
+
+        if bld_mod.get_executor_type() == "aot":
+            executor_factory = _executor_factory.AOTExecutorFactoryModule(
+                ir_mod, target, internal_repr, runtime_mod, mod_name, params
+            )
+        elif bld_mod.get_executor_type() == "graph":
+            executor_factory = _executor_factory.GraphExecutorFactoryModule(
+                ir_mod, target, internal_repr, runtime_mod, mod_name, params
+            )
+        else:
+            assert False, "Executor not supported"

Review comment:
       could you include `build_mod.get_executor_type()` in the message?

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/aot_codegen.cc
+ * \brief AOT executor codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
+/*! \brief Lowered outputs */
+struct AOTLoweredOutput {
+  tir::PrimFunc runner_func;
+  Map<String, IRModule> lowered_funcs;
+  Array<tvm::runtime::Module> external_mods;
+  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> 
params;
+  runtime::AOTMetadata aot_metadata;
+};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variables to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
+  /*!
+   * \brief Utility function to allocate memory for storage identifiers
+   * \param  memory_size_byte size in bytes of the allocation
+   * \return PrimExpr representing the allocated memory
+   */
+  PrimExpr AllocateBackendMemory(int memory_size_byte) {
+    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
+    // to enable unified memory planning
+    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
+    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 
8});
+  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the 
given Relay Expr
+   */
+  std::vector<tir::Var> pack_sid(Expr expr) {
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    std::vector<tir::Var> sid_vars;
+
+    // Note that an expression can have multiple sids associated with it
+    // e.g., returning multiple values from a function
+    for (const auto& sid : sids[0]) {
+      // Determine if an sid is an output buffer
+      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), 
sid_int);
+      if (output_iter != return_sid_.end()) {
+        int output_index = std::distance(return_sid_.begin(), output_iter);
+        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
+        continue;
+      }
+      // Pack the sid inside the TVMValue
+      auto sid_array = te::Var(make_string("sid_", sid, "_value"), 
DataType::Handle());
+      auto sid_value = sids_table_[sid];
+      tvm::PrimExpr set_tensor =
+          tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_set(),
+                         {sid_array, 0, tir::builtin::kArrData, sid_value});
+      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), 
tir::Evaluate(set_tensor)));
+      sid_vars.push_back(sid_array);
+    }
+    return sid_vars;
+  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression associated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[params_by_expr_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * \brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (params_by_expr_.find(arg) != params_by_expr_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * \brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
+  /*!
+   * \brief Copy a variable to the output. This function is mainly used in edge 
cases
+   * when we want to return an input or a parameter.
+   */
+  void copy_to_output(te::Var out, te::Var in, size_t size) {
+    auto retval_get = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                     {in, 0, tir::builtin::kArrData});
+
+    // Define intermediate DLTensor to load/store the data
+    auto tmp0 = te::Var("tmp0", DataType::Handle());
+    auto tmp1 = te::Var("tmp1", DataType::Handle());
+    te::Var loop_idx("i", DataType::Int(32));
+    auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, 
tir::const_true());
+    auto tostore = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                  {out, 0, tir::builtin::kArrData});
+
+    // Copy the variable from the input to the output
+    tir::Stmt copy = tir::For(
+        loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial,
+        tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, 
tir::const_true()));
+    stmts_.push_back(tir::LetStmt(tmp1, tostore, copy));
+  }
+
+  /*!
+   * Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string make_string(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  void VisitExpr_(const CallNode* op) override {
+    // Descend the call tree
+    for (auto arg : op->args) {
+      VisitExpr(arg);
+    }
+
+    Expr expr = GetRef<Expr>(op);
+    Function func;
+    if (op->op.as<OpNode>()) {
+      LOG(FATAL) << "Operators should be transformed away; try applying"
+                 << "the fuse_ops transformation to the expression.";
+    } else if (op->op.as<GlobalVarNode>()) {
+      LOG(FATAL) << "Not implemented";
+    } else if (op->op.as<FunctionNode>()) {
+      func = GetRef<Function>(op->op.as<FunctionNode>());
+    } else {
+      LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->GetTypeKey();
+    }
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      LOG(FATAL) << "TVM only support calls to primitive functions "
+                 << "(i.e functions composed of fusable operator invocations)";
+    }
+
+    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
+    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
+    Target target;
+    // Handle external function
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = (*pf0)(func, target);
+      CachedFunc ext_func = (*pf1)(compile_engine_, key);
+      ICHECK(ext_func.defined()) << "External function is not defined.";
+      UpdateConstants(func, &params_);
+
+      // Generate the TIR function call
+      func_call(GetRef<Call>(op), ext_func->func_name);
+    }
+
+    ICHECK_GE(storage_device_map_.count(expr), 0);
+    auto& device_type = storage_device_map_[expr][1];
+    auto call_dev_type = device_type[0]->value;
+    // Normal Relay Function
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(call_dev_type);
+      }
+      if (targets_.count(call_dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      target = targets_[call_dev_type];
+    }
+    CCacheKey key = (*pf0)(func, target);
+    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
+    if (!lowered_funcs_.count(target->str())) {
+      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+    }
+    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Generate the TIR function call
+    func_call(GetRef<Call>(op), lowered_func->func_name);
+  }
+
+  void VisitExpr_(const VarNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+
+    // If the Var node is an output node we need to copy the content of the 
variable to the output
+    // It's safe to check the SID here because Var StorageToken are never 
reallocated
+    Array<IntegerArray> sids = storage_device_map_[expr];
+
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto var_expr = find_expr(expr);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
var_expr[0], sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    size_t index = params_.size();
+    std::string name = "p" + std::to_string(index);
+
+    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    params_[name] = op->data;
+    params_by_expr_.Set(expr, name);
+
+    // If the Constant node is an output node we need to copy the content of 
the parameter to the
+    // output A Var node can only produce a single output
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
pack_param(expr),
+                     sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    // TODO(giuseros): support Let nodes in AOT
+    CHECK(false) << "Let not yet implemented in AOT";
+  }
+  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); 
}
+  void VisitExpr_(const OpNode* op) override {
+    throw std::runtime_error("can not compile op in non-eta expanded form");
+  }
+  void VisitExpr_(const GlobalVarNode* op) override { throw 
std::runtime_error(""); }
+  void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if 
not supported"); }
+  void VisitExpr_(const FunctionNode* op) override {
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
+        << "Only functions supported by custom codegen";
+  }
+  void VisitExpr_(const RefCreateNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefReadNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefWriteNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const ConstructorNode* op) override {
+    throw std::invalid_argument("ADT constructor case not yet implemented");
+  }
+  void VisitExpr_(const MatchNode* op) override {
+    throw std::invalid_argument("match case not yet implemented");
+  }
+
+  // Create the main PrimFunc to execute the graph
+  tir::PrimFunc CreateMainFunc(unsigned int relay_params) {
+    tir::Stmt body = tir::SeqStmt(stmts_);
+
+    // Allocate the sids
+    std::unordered_map<int, bool> allocated;
+
+    for (auto kv : storage_device_map_) {
+      // Only allocate sids that are needed
+      const bool is_input =
+          (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != 
input_vars_.end());
+      const bool is_param = (params_by_expr_.find(kv.first) != 
params_by_expr_.end());
+      if (is_input || is_param) {
+        continue;
+      }
+
+      for (unsigned int i = 0; i < kv.second[0].size(); i++) {
+        int size = kv.second[2][i];
+        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
+
+        if (std::find(return_sid_.begin(), return_sid_.end(), sid) != 
return_sid_.end()) {
+          continue;
+        }
+
+        // TODO(giuseros): we should allocate this one time outside the 
PrimFunc
+        // so we don't pay the price of allocation for every inference
+        if (!allocated[sid]) {
+          body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), 
body);
+        }
+        allocated[sid] = true;
+      }
+    }
+
+    // Define the attributes
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body);
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body);
+
+    // Make the PrimFunc
+    return tir::PrimFunc(main_signature_, body, VoidType(), Map<tir::Var, 
tir::Buffer>(),
+                         DictAttrs(dict_attrs_));
+  }
+
+ protected:
+  /*! \brief mod */
+  runtime::Module* mod_;
+  /*! \brief list of input expressions (i.e., variable passed by the user) */
+  std::vector<Expr> input_vars_;
+  /*! \brief input and output variables belonging to the main function 
signature */
+  Array<tir::Var> main_signature_;
+  /*! \brief target device */
+  TargetsMap targets_;
+  /*! \brief target host */
+  Target target_host_;
+  /*! PrimFunc attributes */
+  Map<String, ObjectRef> dict_attrs_;
+
+  /*!
+   * \brief parameters (i.e. ConstantNodes found in the graph).
+   * These are taken as inputs to the GraphRuntime.
+   * Maps param name to a pair of storage_id and NDArray. At runtime, the 
storage_id can be
+   * used to lookup the parameter.
+   */
+  std::unordered_map<std::string, runtime::NDArray> params_;
+  /*! \brief mapping between expression and parameters */
+  Map<Expr, String> params_by_expr_;
+  /*! \brief mapping between parameter names ("p0", "p1", etc..) and storage 
identifiers*/
+  std::unordered_map<std::string, int64_t> param_storage_ids_;
+
+  /*! \brief plan memory of device result */
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  std::unordered_map<int, te::Var> sids_table_;
+  /*! \brief lowered funcs */
+  std::unordered_map<std::string, IRModule> lowered_funcs_;
+  /*! \brief name map */
+  std::unordered_map<std::string, size_t> name_map_;
+  /*! \brief compile engine */
+  CompileEngine compile_engine_;
+  /*! \brief GraphPlanMemory module */
+  runtime::Module graph_plan_memory_module_;
+  /*! \brief the IR module stored which represents the executor program */
+  Map<String, IRModule> tir_module_;

Review comment:
       is this used?

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/aot_codegen.cc
+ * \brief AOT executor codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
/*! \brief Lowered outputs of the AOT codegen */
struct AOTLoweredOutput {
  /*! \brief the generated TIR "main" PrimFunc that runs the network */
  tir::PrimFunc runner_func;
  /*! \brief lowered functions grouped by target string */
  Map<String, IRModule> lowered_funcs;
  /*! \brief modules produced by external (BYOC) codegens */
  Array<tvm::runtime::Module> external_mods;
  /*! \brief parameter name -> (storage id, constant data) */
  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> params;
  /*! \brief metadata consumed by the AOT runtime */
  runtime::AOTMetadata aot_metadata;
};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variable to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
  /*!
   * \brief Utility function to allocate memory for storage identifiers
   * \param memory_size_byte size in bytes of the allocation
   * \return PrimExpr representing the allocated memory
   */
  PrimExpr AllocateBackendMemory(int memory_size_byte) {
    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
    // to enable unified memory planning
    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
    // Args: device_type (1 — presumably kDLCPU, TODO confirm), device_id (0),
    // size in bytes, then dtype code/bits hints (2, 8) — verify against the
    // TVMBackendAllocWorkspace signature before changing.
    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 8});
  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
  /*!
   * \brief Return a vector of variables that represents the sids for the given Relay Expr.
   * Output sids map directly to the corresponding output slot of the main
   * signature; every other sid is packed into a freshly stack-allocated
   * DLTensor (a LetStmt is appended to stmts_ as a side effect).
   */
  std::vector<tir::Var> pack_sid(Expr expr) {
    Array<IntegerArray> sids = storage_device_map_[expr];
    std::vector<tir::Var> sid_vars;

    // Note that an expression can have multiple sids associated with it
    // e.g., returning multiple values from a function
    for (const auto& sid : sids[0]) {
      // Determine if an sid is an output buffer
      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int);
      if (output_iter != return_sid_.end()) {
        // Outputs live after the inputs in the main signature.
        int output_index = std::distance(return_sid_.begin(), output_iter);
        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
        continue;
      }
      // Pack the sid inside the TVMValue: allocate a DLTensor on the stack and
      // point its data field at the buffer previously bound to this sid.
      auto sid_array = te::Var(make_string("sid_", sid, "_value"), DataType::Handle());
      auto sid_value = sids_table_[sid];
      tvm::PrimExpr set_tensor =
          tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
                         {sid_array, 0, tir::builtin::kArrData, sid_value});
      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor)));
      sid_vars.push_back(sid_array);
    }
    return sid_vars;
  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression assicated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[params_by_expr_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (params_by_expr_.find(arg) != params_by_expr_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
  /*!
   * \brief Copy a variable to the output. This function is mainly used in edge cases
   * when we want to return an input or a parameter.
   * \param out DLTensor variable receiving the data
   * \param in DLTensor variable providing the data
   * \param size number of bytes to copy (NOTE(review): treated as a byte count,
   *        since elements are loaded as UInt(8) — confirm callers pass bytes)
   */
  void copy_to_output(te::Var out, te::Var in, size_t size) {
    // Read the data pointer out of the input DLTensor.
    auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                     {in, 0, tir::builtin::kArrData});

    // Define intermediate DLTensor to load/store the data
    auto tmp0 = te::Var("tmp0", DataType::Handle());
    auto tmp1 = te::Var("tmp1", DataType::Handle());
    te::Var loop_idx("i", DataType::Int(32));
    auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true());
    // Data pointer of the output DLTensor.
    auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                  {out, 0, tir::builtin::kArrData});

    // Copy the variable from the input to the output, one byte per iteration.
    tir::Stmt copy = tir::For(
        loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial,
        tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true()));
    stmts_.push_back(tir::LetStmt(tmp1, tostore, copy));
  }
+
+  /*!
+   * Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string make_string(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  void VisitExpr_(const CallNode* op) override {
+    // Descend the call tree
+    for (auto arg : op->args) {
+      VisitExpr(arg);
+    }
+
+    Expr expr = GetRef<Expr>(op);
+    Function func;
+    if (op->op.as<OpNode>()) {
+      LOG(FATAL) << "Operators should be transformed away; try applying"
+                 << "the fuse_ops transformation to the expression.";
+    } else if (op->op.as<GlobalVarNode>()) {
+      LOG(FATAL) << "Not implemented";
+    } else if (op->op.as<FunctionNode>()) {
+      func = GetRef<Function>(op->op.as<FunctionNode>());
+    } else {
+      LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->GetTypeKey();
+    }
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      LOG(FATAL) << "TVM only support calls to primitive functions "
+                 << "(i.e functions composed of fusable operator invocations)";
+    }
+
+    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
+    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
+    Target target;
+    // Handle external function
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = (*pf0)(func, target);
+      CachedFunc ext_func = (*pf1)(compile_engine_, key);
+      ICHECK(ext_func.defined()) << "External function is not defined.";
+      UpdateConstants(func, &params_);
+
+      // Generate the TIR function call
+      func_call(GetRef<Call>(op), ext_func->func_name);
+    }
+
+    ICHECK_GE(storage_device_map_.count(expr), 0);
+    auto& device_type = storage_device_map_[expr][1];
+    auto call_dev_type = device_type[0]->value;
+    // Normal Relay Function
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(call_dev_type);
+      }
+      if (targets_.count(call_dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      target = targets_[call_dev_type];
+    }
+    CCacheKey key = (*pf0)(func, target);
+    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
+    if (!lowered_funcs_.count(target->str())) {
+      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+    }
+    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Generate the TIR function call
+    func_call(GetRef<Call>(op), lowered_func->func_name);
+  }
+
  void VisitExpr_(const VarNode* op) override {
    Expr expr = GetRef<Expr>(op);

    // If the Var node is an output node we need to copy the content of the variable to the output
    // It's safe to check the SID here because Var StorageToken are never reallocated
    Array<IntegerArray> sids = storage_device_map_[expr];

    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
    if (output_iter != return_sid_.end()) {
      // Output slots follow the inputs in the main signature.
      int output_index = std::distance(return_sid_.begin(), output_iter);
      auto var_expr = find_expr(expr);
      // sids[2][0] is presumably the buffer size in bytes (memory-plan layout
      // [sids, device types, sizes] — TODO confirm), used as the copy length.
      copy_to_output(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]);
    }
  }
+
  void VisitExpr_(const ConstantNode* op) override {
    Expr expr = GetRef<Expr>(op);
    // Register the constant as a named parameter ("p0", "p1", ...).
    size_t index = params_.size();
    std::string name = "p" + std::to_string(index);

    // Remember the storage id memory planning assigned to this parameter.
    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
    params_[name] = op->data;
    params_by_expr_.Set(expr, name);

    // If the Constant node is an output node we need to copy the content of the parameter to the
    // output A Var node can only produce a single output
    Array<IntegerArray> sids = storage_device_map_[expr];
    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
    if (output_iter != return_sid_.end()) {
      int output_index = std::distance(return_sid_.begin(), output_iter);
      // sids[2][0] is presumably the constant's buffer size in bytes — TODO confirm.
      copy_to_output(main_signature_[input_vars_.size() + output_index], pack_param(expr),
                     sids[2][0]);
    }
  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
  // ---- Relay constructs not (yet) supported by the AOT codegen. ----
  // NOTE(review): the failure style is inconsistent — CHECK/LOG for some
  // constructs, C++ exceptions for others; consider unifying.
  void VisitExpr_(const LetNode* op) override {
    // TODO(giuseros): support Let nodes in AOT
    CHECK(false) << "Let not yet implemented in AOT";
  }
  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); }
  void VisitExpr_(const OpNode* op) override {
    throw std::runtime_error("can not compile op in non-eta expanded form");
  }
  void VisitExpr_(const GlobalVarNode* op) override { throw std::runtime_error(""); }
  void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if not supported"); }
  // External (BYOC) functions are the only function literals allowed here.
  void VisitExpr_(const FunctionNode* op) override {
    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
        << "Only functions supported by custom codegen";
  }
  void VisitExpr_(const RefCreateNode* op) override {
    throw std::invalid_argument("reference not supported");
  }
  void VisitExpr_(const RefReadNode* op) override {
    throw std::invalid_argument("reference not supported");
  }
  void VisitExpr_(const RefWriteNode* op) override {
    throw std::invalid_argument("reference not supported");
  }
  void VisitExpr_(const ConstructorNode* op) override {
    throw std::invalid_argument("ADT constructor case not yet implemented");
  }
  void VisitExpr_(const MatchNode* op) override {
    throw std::invalid_argument("match case not yet implemented");
  }
+
  // Create the main PrimFunc to execute the graph.
  // NOTE(review): `relay_params` is not referenced in this body — confirm it
  // is needed before keeping it in the signature.
  tir::PrimFunc CreateMainFunc(unsigned int relay_params) {
    tir::Stmt body = tir::SeqStmt(stmts_);

    // Allocate the sids
    std::unordered_map<int, bool> allocated;

    for (auto kv : storage_device_map_) {
      // Only allocate sids that are needed: inputs and parameters are bound
      // elsewhere, and output buffers are supplied by the caller.
      const bool is_input =
          (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end());
      const bool is_param = (params_by_expr_.find(kv.first) != params_by_expr_.end());
      if (is_input || is_param) {
        continue;
      }

      for (unsigned int i = 0; i < kv.second[0].size(); i++) {
        int size = kv.second[2][i];
        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);

        if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) {
          continue;
        }

        // TODO(giuseros): we should allocate this one time outside the PrimFunc
        // so we dont' pay the price of allocation for every inference
        if (!allocated[sid]) {
          body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), body);
        }
        allocated[sid] = true;
      }
    }

    // Define the attributes. device_type is hard-coded to 1 (presumably
    // kDLCPU — TODO confirm) and device_id to 0.
    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body);
    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body);

    // Make the PrimFunc: void return, no buffer map, user-provided attrs.
    return tir::PrimFunc(main_signature_, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                         DictAttrs(dict_attrs_));
  }
+
 protected:
  /*! \brief module being generated (non-owning pointer — TODO confirm ownership) */
  runtime::Module* mod_;
  /*! \brief list of input expressions (i.e., variables passed by the user) */
  std::vector<Expr> input_vars_;
  /*! \brief input and output variables belonging to the main function signature */
  Array<tir::Var> main_signature_;
  /*! \brief map from device type to compilation target */
  TargetsMap targets_;
  /*! \brief target host */
  Target target_host_;
  /*! \brief attributes attached to the generated main PrimFunc */
  Map<String, ObjectRef> dict_attrs_;

  /*!
   * \brief parameters (i.e. ConstantNodes found in the graph).
   * These are taken as inputs to the GraphRuntime.
   * Maps param name to a pair of storage_id and NDArray. At runtime, the storage_id can be
   * used to lookup the parameter.
   */
  std::unordered_map<std::string, runtime::NDArray> params_;
  /*! \brief mapping between expression and parameter names ("p0", "p1", ...) */
  Map<Expr, String> params_by_expr_;
  /*! \brief mapping between parameter names ("p0", "p1", etc..) and storage identifiers */
  std::unordered_map<std::string, int64_t> param_storage_ids_;

  /*! \brief memory plan: Expr -> [storage ids, device types, sizes] (layout per planner — TODO confirm) */
  Map<Expr, Array<IntegerArray>> storage_device_map_;
  /*! \brief storage id -> TIR variable holding the allocated buffer */
  std::unordered_map<int, te::Var> sids_table_;
  /*! \brief lowered funcs, keyed by target string */
  std::unordered_map<std::string, IRModule> lowered_funcs_;
  /*! \brief name map (NOTE(review): appears unused in this view — reviewer asked the same; confirm before removing) */
  std::unordered_map<std::string, size_t> name_map_;

Review comment:
       is this used?

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/aot_codegen.cc
+ * \brief AOT executor codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
/*! \brief Lowered outputs of the AOT codegen */
struct AOTLoweredOutput {
  /*! \brief the generated TIR "main" PrimFunc that runs the network */
  tir::PrimFunc runner_func;
  /*! \brief lowered functions grouped by target string */
  Map<String, IRModule> lowered_funcs;
  /*! \brief modules produced by external (BYOC) codegens */
  Array<tvm::runtime::Module> external_mods;
  /*! \brief parameter name -> (storage id, constant data) */
  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> params;
  /*! \brief metadata consumed by the AOT runtime */
  runtime::AOTMetadata aot_metadata;
};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+/*! \brief Code generator for AOT executor */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variable to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
  /*!
   * \brief Utility function to allocate memory for storage identifiers
   * \param memory_size_byte size in bytes of the allocation
   * \return PrimExpr representing the allocated memory
   */
  PrimExpr AllocateBackendMemory(int memory_size_byte) {
    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
    // to enable unified memory planning
    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
    // Args: device_type (1 — presumably kDLCPU, TODO confirm), device_id (0),
    // size in bytes, then dtype code/bits hints (2, 8) — verify against the
    // TVMBackendAllocWorkspace signature before changing.
    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 8});
  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
  /*!
   * \brief Return a vector of variables that represents the sids for the given Relay Expr.
   * Output sids map directly to the corresponding output slot of the main
   * signature; every other sid is packed into a freshly stack-allocated
   * DLTensor (a LetStmt is appended to stmts_ as a side effect).
   */
  std::vector<tir::Var> pack_sid(Expr expr) {
    Array<IntegerArray> sids = storage_device_map_[expr];
    std::vector<tir::Var> sid_vars;

    // Note that an expression can have multiple sids associated with it
    // e.g., returning multiple values from a function
    for (const auto& sid : sids[0]) {
      // Determine if an sid is an output buffer
      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int);
      if (output_iter != return_sid_.end()) {
        // Outputs live after the inputs in the main signature.
        int output_index = std::distance(return_sid_.begin(), output_iter);
        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
        continue;
      }
      // Pack the sid inside the TVMValue: allocate a DLTensor on the stack and
      // point its data field at the buffer previously bound to this sid.
      auto sid_array = te::Var(make_string("sid_", sid, "_value"), DataType::Handle());
      auto sid_value = sids_table_[sid];
      tvm::PrimExpr set_tensor =
          tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
                         {sid_array, 0, tir::builtin::kArrData, sid_value});
      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), tir::Evaluate(set_tensor)));
      sid_vars.push_back(sid_array);
    }
    return sid_vars;
  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression assicated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[params_by_expr_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (params_by_expr_.find(arg) != params_by_expr_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
  /*!
   * \brief Copy a variable to the output. This function is mainly used in edge cases
   * when we want to return an input or a parameter.
   * \param out DLTensor variable receiving the data
   * \param in DLTensor variable providing the data
   * \param size number of bytes to copy (NOTE(review): treated as a byte count,
   *        since elements are loaded as UInt(8) — confirm callers pass bytes)
   */
  void copy_to_output(te::Var out, te::Var in, size_t size) {
    // Read the data pointer out of the input DLTensor.
    auto retval_get = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                     {in, 0, tir::builtin::kArrData});

    // Define intermediate DLTensor to load/store the data
    auto tmp0 = te::Var("tmp0", DataType::Handle());
    auto tmp1 = te::Var("tmp1", DataType::Handle());
    te::Var loop_idx("i", DataType::Int(32));
    auto retval_i = tir::Load(DataType::UInt(8), tmp0, loop_idx, tir::const_true());
    // Data pointer of the output DLTensor.
    auto tostore = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_get(),
                                  {out, 0, tir::builtin::kArrData});

    // Copy the variable from the input to the output, one byte per iteration.
    tir::Stmt copy = tir::For(
        loop_idx, 0, ConstInt32(size), tir::ForKind::kSerial,
        tir::Store(tmp1, tir::Let(tmp0, retval_get, retval_i), loop_idx, tir::const_true()));
    stmts_.push_back(tir::LetStmt(tmp1, tostore, copy));
  }
+
+  /*!
+   * Utility function to string together different arguments
+   */
+  template <typename... Args>
+  std::string make_string(Args const&... args) {
+    std::ostringstream ss;
+    using List = int[];
+    (void)List{0, ((void)(ss << args), 0)...};
+
+    return ss.str();
+  }
+
+  void VisitExpr_(const CallNode* op) override {
+    // Descend the call tree
+    for (auto arg : op->args) {
+      VisitExpr(arg);
+    }
+
+    Expr expr = GetRef<Expr>(op);
+    Function func;
+    if (op->op.as<OpNode>()) {
+      LOG(FATAL) << "Operators should be transformed away; try applying"
+                 << "the fuse_ops transformation to the expression.";
+    } else if (op->op.as<GlobalVarNode>()) {
+      LOG(FATAL) << "Not implemented";
+    } else if (op->op.as<FunctionNode>()) {
+      func = GetRef<Function>(op->op.as<FunctionNode>());
+    } else {
+      LOG(FATAL) << "TVM runtime does not support calls to " << 
op->op->GetTypeKey();
+    }
+    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
+      LOG(FATAL) << "TVM only support calls to primitive functions "
+                 << "(i.e functions composed of fusable operator invocations)";
+    }
+
+    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
+    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
+    Target target;
+    // Handle external function
+    if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      target = Target("ext_dev");
+      CCacheKey key = (*pf0)(func, target);
+      CachedFunc ext_func = (*pf1)(compile_engine_, key);
+      ICHECK(ext_func.defined()) << "External function is not defined.";
+      UpdateConstants(func, &params_);
+
+      // Generate the TIR function call
+      func_call(GetRef<Call>(op), ext_func->func_name);
+    }
+
+    ICHECK_GE(storage_device_map_.count(expr), 0);
+    auto& device_type = storage_device_map_[expr][1];
+    auto call_dev_type = device_type[0]->value;
+    // Normal Relay Function
+    if (targets_.size() == 1) {
+      // homogeneous execution.
+      const auto& it = targets_.begin();
+      target = (*it).second;
+    } else {
+      // heterogeneous execution.
+      std::string call_dev_name;
+      if (call_dev_type == 0) {
+        call_dev_name = "llvm";
+      } else {
+        call_dev_name = runtime::DeviceName(call_dev_type);
+      }
+      if (targets_.count(call_dev_type) == 0) {
+        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+      }
+      target = targets_[call_dev_type];
+    }
+    CCacheKey key = (*pf0)(func, target);
+    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
+    if (!lowered_funcs_.count(target->str())) {
+      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+    }
+    lowered_funcs_[target->str()]->Update(lowered_func->funcs);
+
+    // Generate the TIR function call
+    func_call(GetRef<Call>(op), lowered_func->func_name);
+  }
+
+  void VisitExpr_(const VarNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+
+    // If the Var node is an output node we need to copy the content of the 
variable to the output
+    // It's safe to check the SID here because Var StorageToken are never 
reallocated
+    Array<IntegerArray> sids = storage_device_map_[expr];
+
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      auto var_expr = find_expr(expr);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
var_expr[0], sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) override {
+    Expr expr = GetRef<Expr>(op);
+    size_t index = params_.size();
+    std::string name = "p" + std::to_string(index);
+
+    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    params_[name] = op->data;
+    params_by_expr_.Set(expr, name);
+
+    // If the Constant node is an output node we need to copy the content of 
the parameter to the
+    // output A Var node can only produce a single output
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
+                                 
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    if (output_iter != return_sid_.end()) {
+      int output_index = std::distance(return_sid_.begin(), output_iter);
+      copy_to_output(main_signature_[input_vars_.size() + output_index], 
pack_param(expr),
+                     sids[2][0]);
+    }
+  }
+
+  void VisitExpr_(const TupleNode* op) override {
+    for (auto field : op->fields) {
+      VisitExpr(field);
+    }
+  }
+
+  void VisitExpr_(const LetNode* op) override {
+    // TODO(giuseros): support Let nodes in AOT
+    CHECK(false) << "Let not yet implemented in AOT";
+  }
+  void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); 
}
+  void VisitExpr_(const OpNode* op) override {
+    throw std::runtime_error("can not compile op in non-eta expanded form");
+  }
+  void VisitExpr_(const GlobalVarNode* op) override { throw 
std::runtime_error(""); }
+  void VisitExpr_(const IfNode* op) override { throw std::invalid_argument("if 
not supported"); }
+  void VisitExpr_(const FunctionNode* op) override {
+    ICHECK(op->GetAttr<String>(attr::kCompiler).defined())
+        << "Only functions supported by custom codegen";
+  }
+  void VisitExpr_(const RefCreateNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefReadNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const RefWriteNode* op) override {
+    throw std::invalid_argument("reference not supported");
+  }
+  void VisitExpr_(const ConstructorNode* op) override {
+    throw std::invalid_argument("ADT constructor case not yet implemented");
+  }
+  void VisitExpr_(const MatchNode* op) override {
+    throw std::invalid_argument("match case not yet implemented");
+  }
+
+  // Create the main PrimFunc to execute the graph
+  tir::PrimFunc CreateMainFunc(unsigned int relay_params) {
+    tir::Stmt body = tir::SeqStmt(stmts_);
+
+    // Allocate the sids
+    std::unordered_map<int, bool> allocated;
+
+    for (auto kv : storage_device_map_) {
+      // Only allocate sids that are needed
+      const bool is_input =
+          (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != 
input_vars_.end());
+      const bool is_param = (params_by_expr_.find(kv.first) != 
params_by_expr_.end());
+      if (is_input || is_param) {
+        continue;
+      }
+
+      for (unsigned int i = 0; i < kv.second[0].size(); i++) {
+        int size = kv.second[2][i];
+        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
+
+        if (std::find(return_sid_.begin(), return_sid_.end(), sid) != 
return_sid_.end()) {
+          continue;
+        }
+
+        // TODO(giuseros): we should allocate this one time outside the 
PrimFunc
+        // so we dont' pay the price of allocation for every inference
+        if (!allocated[sid]) {
+          body = tir::LetStmt(sids_table_[sid], AllocateBackendMemory(size), 
body);
+        }
+        allocated[sid] = true;
+      }
+    }
+
+    // Define the attributes
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_type, 1, body);
+    body = tir::AttrStmt(PrimExpr(), tvm::tir::attr::device_id, 0, body);
+
+    // Make the PrimFunc
+    return tir::PrimFunc(main_signature_, body, VoidType(), Map<tir::Var, 
tir::Buffer>(),
+                         DictAttrs(dict_attrs_));
+  }
+
+ protected:
+  /*! \brief mod */
+  runtime::Module* mod_;
+  /*! \brief list of input expressions (i.e., variable passed by the user) */
+  std::vector<Expr> input_vars_;
+  /*! \brief input and output variables belonging to the main function 
signature */
+  Array<tir::Var> main_signature_;
+  /*! \brief target device */
+  TargetsMap targets_;
+  /*! \brief target host */
+  Target target_host_;
+  /*! PrimFunc attributes */
+  Map<String, ObjectRef> dict_attrs_;
+
+  /*!
+   * \brief parameters (i.e. ConstantNodes found in the graph).
+   * These are take as inputs to the GraphRuntime.
+   * Maps param name to a pair of storage_id and NDArray. At runtime, the 
storage_id can be
+   * used to lookup the parameter.
+   */
+  std::unordered_map<std::string, runtime::NDArray> params_;
+  /*! \brief mapping between expression and parameters */
+  Map<Expr, String> params_by_expr_;
+  /*! \brief mapping between parameter names ("p0", "p1", etc..) and storage 
identifiers*/
+  std::unordered_map<std::string, int64_t> param_storage_ids_;
+
+  /*! \brief plan memory of device result */
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  std::unordered_map<int, te::Var> sids_table_;
+  /*! \brief lowered funcs */
+  std::unordered_map<std::string, IRModule> lowered_funcs_;
+  /*! \brief name map */
+  std::unordered_map<std::string, size_t> name_map_;
+  /*! \brief compile engine */
+  CompileEngine compile_engine_;
+  /*! \brief GraphPlanMemory module */
+  runtime::Module graph_plan_memory_module_;

Review comment:
       is this used?

##########
File path: src/runtime/crt/memory/stack_allocator.c
##########
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// LINT_C_FILE
+
+#include <tvm/runtime/crt/stack_allocator.h>
+
+void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, 
int32_t nbytes) {
+  uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 
1);
+  uint8_t* current_alloc = tvm_runtime_workspace->next_alloc;
+  uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + 
offset_bytes;
+  uint8_t* workspace_end = tvm_runtime_workspace->workspace + 
tvm_runtime_workspace->workspace_size;
+
+  if (next_alloc > workspace_end) {
+    return NULL;
+  }
+
+  tvm_runtime_workspace->next_alloc = next_alloc;
+  return current_alloc;
+}
+
+tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* 
tvm_runtime_workspace, void* ptr) {
+  tvm_runtime_workspace->next_alloc = ptr;

Review comment:
       should assert here that ptr obeys the stack discipline — i.e. that it is the most recently allocated (LIFO) pointer

##########
File path: src/target/source/codegen_c_host.cc
##########
@@ -211,21 +214,34 @@ void CodeGenCHost::PrintGetFuncFromBackend(const 
std::string& func_name,
   this->stream << "}\n";
 }
 
-void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, int 
num_args) {
+void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, PrimExpr 
values,
+                                 int num_args) {
   this->PrintIndent();
+  std::string stack_value = "stack_value";
+  if (const VarNode* stack_value_var = values.as<VarNode>()) {
+    stack_value = stack_value_var->name_hint;
+  }
   std::string ret_val = GetUniqueName("ret_val");
   std::string ret_type_code = GetUniqueName("ret_type_code");
   this->stream << "TVMValue " << ret_val << ";\n";
   this->PrintIndent();
   this->stream << "int " << ret_type_code << ";\n";
   this->PrintIndent();
-  this->stream << "if (TVMFuncCall(" << packed_func_name << ", "
-               << "(TVMValue*) stack_value"
-               << ", "
+
+  if (is_aot_executor_) {

Review comment:
       i'd say that for this PR, we should either:
   1. revert this and just use TVMFuncCall for now in AOT
   2. implement the new TIR node and make this logic dependent on the type of 
TIR being codegen'd.
   
   I don't think we should merge as-is.

##########
File path: src/runtime/crt/memory/stack_allocator.c
##########
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// LINT_C_FILE
+
+#include <tvm/runtime/crt/stack_allocator.h>
+
+void* StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, 
int32_t nbytes) {
+  uint32_t offset_bytes = (~nbytes + 1) & (TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES - 
1);

Review comment:
       does it make sense to store some tag e.g. just before or after the 
returned ptr, to help with the assert in Free?

##########
File path: src/target/source/codegen_c_host.cc
##########
@@ -324,15 +343,20 @@ inline void CodeGenCHost::PrintTernaryCondExpr(const T* 
op, const char* compare,
 }
 
 runtime::Module BuildCHost(IRModule mod, Target target) {
+  bool is_aot_executor =

Review comment:
       I don't think this should be needed in CodeGenCHost, perhaps the 
exception being the function ordering. We could also just print prototypes.

##########
File path: src/target/source/codegen_c_host.cc
##########
@@ -274,8 +290,11 @@ void CodeGenCHost::VisitExpr_(const CallNode* op, 
std::ostream& os) {  // NOLINT
           << "Expected name " << packed_func_name << " to not be taken";
       decl_stream << "static void* " << packed_func_name << " = NULL;\n";
     }
-    this->PrintGetFuncFromBackend(func_name, packed_func_name);
-    this->PrintFuncCall(packed_func_name, num_args);
+    if (!is_aot_executor_) {

Review comment:
       same thing here--should either implement the new tir node or revert for 
now

##########
File path: tests/python/unittest/test_crt.py
##########
@@ -157,8 +157,8 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), 
uint8]) {
         factory = tvm.relay.build(relay_mod, target=TARGET)
 
     with _make_session(workspace, factory.get_lib()) as sess:
-        graph_mod = tvm.micro.create_local_graph_executor(
-            factory.get_json(), sess.get_system_lib(), sess.device
+        graph_mod = tvm.micro.create_local_graph_runtime(

Review comment:
       revert the first line here

##########
File path: tests/python/relay/test_backend_graph_executor.py
##########
@@ -133,7 +133,7 @@ def test_plan_memory():
     storage_ids = set()
     device_types = set()
     for k, v in smap.items():
-        assert len(v) == 2
+        assert len(v) == 3

Review comment:
       want to assert on the v[2] element?

##########
File path: tests/python/relay/aot/infra.py
##########
@@ -0,0 +1,226 @@
+# Licensed to the Apache Software Foundation (ASF) under one

Review comment:
       might propose to name this aot_test_util.py

##########
File path: include/tvm/target/target_kind.h
##########
@@ -140,6 +140,12 @@ static constexpr const char* kTvmRuntimeCpp = "c++";
 /*! \brief Value used with --runtime in target specs to indicate the C 
runtime. */
 static constexpr const char* kTvmRuntimeCrt = "c";
 
+/*! \brief Value used with --executor in target specs to indicate the graph 
executor. */
+static constexpr const char* kTvmExecutorGraph = "graph";

Review comment:
       we discussed this a bit offline; documenting here. my main question here 
was how we should handle flags like `--executor` and `--runtime` in Target 
string; these don't really influence generated operator implementations and 
therefore may not belong in autotune logs. we're not near consensus here, but 
general thinking was that perhaps a second configuration should be passed to 
`tvm.relay.build` containing e.g. relay compiler configuration.

##########
File path: src/relay/backend/aot_codegen.cc
##########
@@ -0,0 +1,675 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/backend/graph_codegen.cc
+ * \brief Graph runtime codegen
+ */
+
+#include <dmlc/any.h>
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt.h>
+
+#include <algorithm>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "../../runtime/meta_data.h"
+#include "compile_engine.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace backend {
+
+using IntegerArray = Array<Integer>;
+using ShapeVector = std::vector<std::vector<int64_t>>;
+using GraphAttrs = std::unordered_map<std::string, dmlc::any>;
+using TargetsMap = std::unordered_map<int, Target>;
+
+/*! \brief Lowered outputs */
+struct AOTLoweredOutput {
+  std::string graph_tir;
+  Map<String, IRModule> lowered_funcs;
+  Array<tvm::runtime::Module> external_mods;
+  std::unordered_map<std::string, std::pair<int, const tvm::runtime::NDArray>> 
params;
+  runtime::AOTMetadata aot_metadata;
+};
+
+class AotReturnSidVisitor : public ExprVisitor {
+ public:
+  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> 
storage_device_map)
+      : storage_device_map_{storage_device_map}, return_sid_{-1} {}
+
+  IntegerArray FindReturnSid(Function func) {
+    VisitExpr(func->body);
+    return return_sid_;
+  }
+
+ protected:
+  void AssignReturnSid(Expr e) {
+    auto iter = storage_device_map_.find(e);
+    if (iter != storage_device_map_.end()) {
+      return_sid_ = (*iter).second[0];
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const VarNode* vn) override {
+    ExprVisitor::VisitExpr_(vn);
+    AssignReturnSid(GetRef<Expr>(vn));
+  }
+
+  void VisitExpr_(const CallNode* cn) override {
+    ExprVisitor::VisitExpr_(cn);
+    AssignReturnSid(GetRef<Expr>(cn));
+  }
+
+  void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); }
+
+  void VisitExpr_(const TupleNode* tn) override {
+    ExprVisitor::VisitExpr_(tn);
+    AssignReturnSid(GetRef<Expr>(tn));
+  }
+
+ private:
+  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  IntegerArray return_sid_;
+};
+
+using TIRNetwork = tvm::Array<tir::Stmt>;
+
+/*! \brief Code generator for graph runtime */
+class AOTCodegen : public ExprVisitor {
+ protected:
+  /*!
+   * \brief Utility function to allocate a DLTensor or TVMValue
+   * \param  type the type of allocation
+   * \param num the number of variable to allocate on the stack
+   * \return PrimExpr representing the allocated object
+   */
+  PrimExpr StackAlloca(std::string type, size_t num) {
+    Array<PrimExpr> args = {tir::StringImm(type), ConstInt32(num)};
+    return tir::Call(DataType::Handle(), tir::builtin::tvm_stack_alloca(), 
args);
+  }
+
+  /*!
+   * \brief Utility function to allocate memory for storage identifiers
+   * \param  memory_size_byte size in bytes of the allocation
+   * \return PrimExpr representing the allocated memory
+   */
+  PrimExpr AllocateBackendMemory(int memory_size_byte) {
+    // TODO(giuseros): use tir::Allocate instead of TVMBackendAllocWorkspace
+    // to enable unified memory planning
+    static const Op& op = Op::Get("tir.TVMBackendAllocWorkspace");
+    return tvm::tir::Call(DataType::Handle(), op, {1, 0, memory_size_byte, 2, 
8});
+  }
+
+  /*!
+   * \brief Utility function to convert a concrete integer to a PrimExpr.
+   * \param num the number to convert
+   * \return PrimExpr representing num
+   */
+  inline PrimExpr ConstInt32(size_t num) {
+    ICHECK_LE(num, std::numeric_limits<int>::max());
+    return tir::make_const(DataType::Int(32), static_cast<int>(num));
+  }
+
+  /*!
+   * \brief Return a vector of variables that represents the sids for the 
given Relay Expr
+   */
+  std::vector<tir::Var> pack_sid(Expr expr) {
+    Array<IntegerArray> sids = storage_device_map_[expr];
+    std::vector<tir::Var> sid_vars;
+
+    // Note that an expression can have multiple sids associated with it
+    // e.g., returning multiple values from a function
+    for (const auto& sid : sids[0]) {
+      // Determine if an sid is an output buffer
+      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
+      auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), 
sid_int);
+      if (output_iter != return_sid_.end()) {
+        int output_index = std::distance(return_sid_.begin(), output_iter);
+        sid_vars.push_back(main_signature_[input_vars_.size() + output_index]);
+        continue;
+      }
+      // Pack the sid inside the TVMValue
+      auto sid_array = te::Var(make_string("sid_", sid, "_value"), 
DataType::Handle());
+      auto sid_value = sids_table_[sid];
+      tvm::PrimExpr set_tensor =
+          tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_set(),
+                         {sid_array, 0, tir::builtin::kArrData, sid_value});
+      stmts_.push_back(tir::LetStmt(sid_array, StackAlloca("array", 1), 
tir::Evaluate(set_tensor)));
+      sid_vars.push_back(sid_array);
+    }
+    return sid_vars;
+  }
+
+  /*!
+   * \brief Utility function to return a parameter associated with an 
expression
+   * \param expr Relay Expression assicated with the parameter
+   * \return Variable that represents the DLTensor associated with the 
parameters
+   */
+  tir::Var pack_param(Expr expr) {
+    // TODO(giuseros): Using call_extern to call into lookup_linked_param. 
This is because the
+    // builtin::ret is not supported yet in the c target. Once return is 
supported we can use
+    // tvm_call_packed_lowered().
+    int param_sid = param_storage_ids_[reverse_params_lookup_[expr]];
+    auto lookup_linked_param_fn = 
tir::StringImm(::tvm::runtime::symbol::tvm_lookup_linked_param);
+    auto param_array = te::Var(make_string("param_", param_sid, "_array"), 
DataType::Handle());
+
+    // Compose the lookup_call using a local stack
+    Array<tir::Stmt> lookup_call;
+    auto param_var = te::Var(make_string("param_", param_sid, "_value"), 
DataType::Handle());
+    auto ret_var = te::Var("ret_value", DataType::Handle());
+    auto ret_code = te::Var("ret_value", DataType::Handle());
+
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_var, 0, tir::builtin::kTVMValueContent, 
ConstInt32(param_sid)})));
+    lookup_call.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Handle(), tir::builtin::call_extern(),
+                       {lookup_linked_param_fn, param_var, 0, 0, ret_var, 
ret_code, 0})));
+    auto ret_var_handle = tvm::tir::Call(DataType::Handle(), 
tvm::tir::builtin::tvm_struct_get(),
+                                         {ret_var, 0, 
tir::builtin::kTVMValueContent});
+
+    // Set the param to the value returned by lookup_call
+    tvm::PrimExpr set_param_array =
+        tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::tvm_struct_set(),
+                       {param_array, 0, tir::builtin::kArrData, 
ret_var_handle});
+    lookup_call.push_back(tir::Evaluate(set_param_array));
+
+    tir::Stmt lookup_body = tir::SeqStmt(lookup_call);
+
+    // Allocate the DLTensors on the stack
+    lookup_body = tir::LetStmt(param_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_var, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(ret_code, StackAlloca("arg_value", 1), 
lookup_body);
+    lookup_body = tir::LetStmt(param_array, StackAlloca("arg_value", 1), 
lookup_body);
+    stmts_.push_back(lookup_body);
+    return param_array;
+  }
+
+  /*!
+   * brief Given an expression return the variable(s) associated with that 
expression
+   */
+  std::vector<te::Var> find_expr(Expr arg) {
+    auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg);
+    if (input_iter != input_vars_.end()) {
+      // Input variable
+      int main_index = std::distance(input_vars_.begin(), input_iter);
+      return {main_signature_[main_index]};
+    } else if (reverse_params_lookup_.find(arg) != 
reverse_params_lookup_.end()) {
+      // Parameter of the network
+      return {pack_param(arg)};
+    } else {
+      // Storage identifier (i.e., intermediate memory)
+      return pack_sid(arg);
+    }
+  }
+
+  /*!
+   * brief Call a function with a given name
+   */
+  void func_call(Call call, std::string func_name) {
+    tvm::Array<PrimExpr> args{tvm::tir::StringImm(func_name)};
+    std::vector<tir::Stmt> func_call_stmts;
+
+    // Pack the inputs
+    for (Expr arg : call->args) {
+      auto var_arg = find_expr(arg);
+      args.push_back(var_arg[0]);
+    }
+
+    auto ret_expr = Downcast<Expr>(call);
+
+    // Pack the return(s) value. A call node can produce multiple outputs
+    for (const auto& var : pack_sid(ret_expr)) {
+      args.push_back(var);
+    }
+
+    // Use tvm_call_packed to execute the function
+    func_call_stmts.push_back(tir::Evaluate(
+        tvm::tir::Call(DataType::Int(32), 
tvm::tir::builtin::tvm_call_packed(), args)));
+    tir::Stmt body = tir::SeqStmt(func_call_stmts);
+    stmts_.push_back(body);
+  }
+
+  /*!
+   * brief Copy a variable to the output. This function is mainly used in edge 
cases

Review comment:
       ok that makes sense. I'm good with a TODO, the main goal of this PR 
should just be parity with GraphExecutor.

##########
File path: tests/python/unittest/test_runtime_module_based_interface.py
##########
@@ -526,11 +526,11 @@ def test_debug_graph_executor():
     out = get_output(0).asnumpy()
     tvm.testing.assert_allclose(out, verify(data), atol=1e-5)
 
-    # debug graph executor wrapper
-    debug_g_mod = debug_executor.GraphModuleDebug(
-        complied_graph_lib["debug_create"]("default", dev),
-        [dev],
-        complied_graph_lib.get_json(),
+    # debug graph runtime wrapper
+    debug_g_mod = debug_runtime.GraphModuleDebug(

Review comment:
       at least the runtime part




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to