This is an automated email from the ASF dual-hosted git repository.

srk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 3a57a40c1b [RUNTIME][CLML] Fix for CLML ops and enable more test cases (#15896)
3a57a40c1b is described below

commit 3a57a40c1ba40e1c330346905f8db72775fc9992
Author: krishnaraj36 <quic_kvegi...@quicinc.com>
AuthorDate: Wed Dec 20 13:50:00 2023 +0530

    [RUNTIME][CLML] Fix for CLML ops and enable more test cases (#15896)
    
    * [RUNTIME][CLML] Fix for a few CLML ops
    
    Fixed the dense operator and enhanced the CLML network test cases
    
    * [RUNTIME][CLML] Fix for dense layer and float16
    
    Fixed the dense layer issue at the network level and improved
    coverage of the dense layer with CLML.
    Fixed the float16 crash error.
    
    * Update comment for dense pattern
    
    * Fix in CLML test cases
    
    * Enable more test cases and a few fixes
    
    * Fix the import error
    
    * Fix the import error
    
    * Fix in batchnorm testcase
    
    * Restructure CLML test cases and enable the VM executor
    
    * Fix the import error in clml test network
    
    * Fix the test failure for vm tests
    
    * Update clml.py
---
 python/tvm/relay/op/contrib/clml.py              | 118 ++-
 src/relay/backend/contrib/clml/codegen.cc        |   2 +-
 src/runtime/contrib/clml/clml_runtime.cc         | 521 ++++++++-----
 tests/python/contrib/test_clml/conftest.py       |  21 +-
 tests/python/contrib/test_clml/infrastructure.py | 242 +++---
 tests/python/contrib/test_clml/test_network.py   | 249 +++---
 tests/python/contrib/test_clml/test_ops.py       | 942 +++++++++++++++++------
 tests/scripts/task_python_adreno.sh              |   1 +
 8 files changed, 1332 insertions(+), 764 deletions(-)

diff --git a/python/tvm/relay/op/contrib/clml.py 
b/python/tvm/relay/op/contrib/clml.py
index f194dd114b..14dd35a3cb 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -18,6 +18,7 @@
 """CLML Library supported operators."""
 import json
 from string import Template
+import numpy as np
 import tvm
 
 from tvm import relay
@@ -27,7 +28,7 @@ from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
 from tvm.relay import function as _function
 from tvm.relay.expr_functor import ExprMutator
-from tvm.relay.expr import Call, TupleGetItem
+from tvm.relay.expr import Call, TupleGetItem, Var, Constant
 
 from ...dataflow_pattern import wildcard, is_op, is_constant, 
is_tuple_get_item, is_tuple
 from .register import register_pattern_table
@@ -81,34 +82,61 @@ class RemoveDropoutPass:
         return RemoveDropout().visit(func)
 
 
-class BroadcastInputs(ExprMutator):
+class OptimizeBatchnorm(ExprMutator):
     """
-    Binary operators need broadcasting for CLML.
+    Fuse Conv+Batchnorm and constant-fold to generate Conv+Add.
     """
 
-    def visit_call(self, call):
-        if call.op.name in ["add", "subtract", "multiply", "divide", 
"maximum", "minimum"]:
-            new_fn = self.visit(call.op)
-            call_shape = call.checked_type.shape
-            lhs = call.args[0]
-            rhs = call.args[1]
-            lhs_shape = lhs.checked_type.shape
-            rhs_shape = rhs.checked_type.shape
-            if list(call_shape) != list(lhs_shape):
-                lhs = relay.broadcast_to(self.visit(lhs), call_shape)
-            if list(call_shape) != list(rhs_shape):
-                rhs = relay.broadcast_to(self.visit(rhs), call_shape)
-            args = [lhs, rhs]
-            return Call(new_fn, args, call.attrs)
-        return super().visit_call(call)
+    def visit_call(self, call) -> relay.expr.Expr:
+        new_args = []
+        for arg in call.args:
+            if (
+                not isinstance(arg, (Var, Constant))
+                and isinstance(arg, tvm.relay.TupleGetItem)
+                and arg.tuple_value.op.name == "nn.batch_norm"
+                and (not isinstance(arg.tuple_value.args[0], (Var, Constant)))
+                and arg.tuple_value.args[0].op.name == "nn.conv2d"
+            ):
+                ep = arg.tuple_value.attrs["epsilon"]
+                wt = arg.tuple_value.args[1].data.numpy()
+                bs = arg.tuple_value.args[2].data.numpy()
+                mn = arg.tuple_value.args[3].data.numpy()
+                vr = arg.tuple_value.args[4].data.numpy() + ep
+                dino = np.sqrt(vr)
+                wt = wt / dino
+                bs = bs - mn * wt
+                conv_op = arg.tuple_value.args[0]
+                conv_args = list(conv_op.args)
+                wt_conv = conv_args[1].data.numpy()
+                if conv_op.attrs["kernel_layout"] == "OIHW":
+                    wt = wt.reshape(wt.shape[0], 1, 1, 1)
+                elif conv_op.attrs["kernel_layout"] == "IOHW":
+                    wt = wt.reshape(1, wt.shape[0], 1, 1)
+                else:
+                    raise ValueError("Unsupported Conv2d kernel layout")
+                wt_conv = wt_conv * wt
+                conv_args[1] = relay.const(tvm.nd.array(wt_conv))
+                bs_args = relay.const(tvm.nd.array(bs.reshape(-1, bs.shape[0], 
1, 1)))
+                conv_out = Call(
+                    arg.tuple_value.args[0].op, conv_args, 
arg.tuple_value.args[0].attrs
+                )
+                mod = tvm.relay.add(conv_out, bs_args)
+                new_args.append(mod)
+            else:
+                new_args.append(arg)
+
+        call = Call(call.op, new_args, call.attrs)
+        args = [self.visit(arg) for arg in call.args]
+
+        return Call(call.op, args, call.attrs)
 
 
 @transform.function_pass(opt_level=0)
-class BinaryOpBroadcaster:
+class OptimizeBatchnormPass:
     def transform_function(
         self, func: relay.function.Function, mod: tvm.IRModule, _: 
tvm.transform.PassContext
     ) -> relay.function.Function:
-        return BroadcastInputs().visit(func)
+        return OptimizeBatchnorm().visit(func)
 
 
 def partition_for_clml(mod, params=None, **opts):
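
For reference, the weight folding applied by OptimizeBatchnorm above follows the
standard batch-norm algebra. A minimal NumPy sketch of that arithmetic
(illustrative only; it assumes an OIHW kernel layout and per-output-channel
batch-norm parameters, mirroring the pass):

    import numpy as np

    def fold_batchnorm(conv_weight, gamma, beta, mean, var, eps):
        # conv_weight: (O, I, H, W); gamma/beta/mean/var: one value per output channel (O,)
        scale = gamma / np.sqrt(var + eps)                 # per-channel scale factor
        folded_weight = conv_weight * scale.reshape(-1, 1, 1, 1)
        folded_bias = beta - mean * scale                  # added to the conv output
        return folded_weight, folded_bias
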
@@ -134,8 +162,8 @@ def partition_for_clml(mod, params=None, **opts):
         [
             transform.InferType(),
             RemoveDropoutPass(),
-            BinaryOpBroadcaster(),
             transform.FoldConstant(),
+            OptimizeBatchnormPass(),
             transform.MergeComposite(clml_pattern_table()),
             transform.AnnotateTarget("clml", False),
             transform.MergeCompilerRegions(),
@@ -289,8 +317,15 @@ def clml_pattern_table():
 
         return pattern
 
-    def dense_pattern():
-        """Create a dense pattern."""
+    def dense1d_pattern():
+        """Create a dense pattern for 1d vector to matrix multiple."""
+        pattern = is_op("nn.dense")(wildcard(), is_constant())
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, 
is_constant()))
+        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+        return pattern
+
+    def dense2d_pattern():
+        """Create a dense pattern for 2d matrix to matrix multiple."""
         pattern = is_op("nn.dense")(wildcard(), is_constant())
         return pattern
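
As an illustration, the dense1d pattern above targets a single-row (1 x K) input
multiplied by a constant weight matrix, optionally followed by a bias add. A small
Relay sketch of such a graph (hypothetical shapes, shown only for clarity):

    from tvm import relay
    import numpy as np

    x = relay.var("x", shape=(1, 16), dtype="float32")    # single-row input vector
    w = relay.const(np.zeros((8, 16), dtype="float32"))   # constant weight matrix
    b = relay.const(np.zeros((8,), dtype="float32"))      # constant bias
    y = relay.nn.bias_add(relay.nn.dense(x, w), b)        # shape matched by clml.dense1d
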
 
@@ -377,6 +412,9 @@ def clml_pattern_table():
         if len(call.args[1].checked_type.shape) == 0:
             return False
 
+        if tuple(call.args[0].checked_type.shape) != 
tuple(call.args[1].checked_type.shape):
+            return False
+
         for arg in call.args:
             # Avoid any operators with dtype Int64
             if arg.checked_type.dtype == "int64":
@@ -436,11 +474,33 @@ def clml_pattern_table():
             return False
         return True
 
+    def check_dense1d_op(extract):
+        call = extract
+        # Only support single Matmul
+        if call.args[0].checked_type.shape[0] > 1:
+            return False
+        if not (call.op.name in ["nn.bias_add", "add"] and 
call.args[0].op.name == "nn.dense"):
+            return False
+        return True
+
+    def check_reshape(extract):
+        call = extract
+        call_shape = call.checked_type.shape
+        # Only support batch dim = 1
+        if call_shape[0] > 1:
+            return False
+        # Checking buffer indexing limit
+        for shape in call_shape:
+            if shape > 32768:
+                return False
+        return True
+
     return [
         ("clml.pad_conv2d", pad_conv_pattern(), check_conv),
         ("clml.conv2d", conv_pattern(), check_conv),
         ("clml.conv2d_transpose", conv_transpose_pattern(), 
check_conv_transpose),
-        ("clml.dense", dense_pattern(), check_default_op),
+        ("clml.dense1d", dense1d_pattern(), check_dense1d_op),
+        ("clml.dense2d", dense2d_pattern(), check_default_op),
         ("clml.pad", pad_pattern(), check_pad_op),
         ("clml.concat", concat_pattern(), check_concat_op),
         ("clml.batch_norm", batch_norm_pattern(), check_default_op),
@@ -451,7 +511,7 @@ def clml_pattern_table():
         ("clml.minimum", is_op("minimum")(wildcard(), wildcard()), 
check_binary_op),
         ("clml.maximum", is_op("maximum")(wildcard(), wildcard()), 
check_binary_op),
         ("clml.softmax", is_op("nn.softmax")(wildcard()), check_softmax_op),
-        # ("clml.reshape", is_op("reshape")(wildcard()), check_default_op),
+        ("clml.reshape", is_op("reshape")(wildcard()), check_reshape),
         ("clml.avg_pool2d", is_op("nn.avg_pool2d")(wildcard()), 
check_default_op),
         ("clml.max_pool2d", is_op("nn.max_pool2d")(wildcard()), 
check_default_op),
         ("clml.global_avg_pool2d", is_op("nn.global_avg_pool2d")(wildcard()), 
check_default_op),
@@ -807,7 +867,7 @@ class CLMLGetSubModuleSrc:
                         elif activation == "relu6":
                             activation = "CL_ACTIVATION_RELU6"
                         else:
-                            RuntimeError("Unknown activation:" + activation)
+                            raise RuntimeError("Unknown activation:" + 
activation)
                     has_bias = bool((node["inputs"] == 3) or (node["inputs"] 
== 7))
                     has_bn = bool((node["inputs"] == 6) or (node["inputs"] == 
7))
                     input_tensor = get_tensor_from_map(node["inputs"][0][0])
@@ -907,8 +967,8 @@ class CLMLGetSubModuleSrc:
                         )
                     )
                 elif node["name"] == "nn.batch_norm":
-                    bn_attrs = tuple(node["attrs"]["batchnorm"][0][0])
-                    axis = bn_attrs[0]
+                    bn_attrs = tuple(node["attrs"]["axis"])
+                    axis = int(bn_attrs[0][0])
                     bn_shape = [1, 1, 1, 1]
                     bn_node = self.nodes[node["inputs"][0][0]]
                     bn_shape[axis] = bn_node["attrs"]["shape"][0][0]
@@ -1094,7 +1154,7 @@ class CLMLGetSubModuleSrc:
                         )
                     )
                 else:
-                    RuntimeError("Unsupported Op:" + node["name"])
+                    raise RuntimeError("Unsupported Op:" + node["name"])
                 self.clml_code.append(
                     self.MapInsert.substitute(nid=node_out_name, 
tensor_desc=node_out_name)
                 )
diff --git a/src/relay/backend/contrib/clml/codegen.cc 
b/src/relay/backend/contrib/clml/codegen.cc
index 069e11dac5..5d6fc0c2cf 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -87,7 +87,7 @@ class CLMLJSONSerializer : public 
backend::contrib::JSONSerializer {
       json_node = CreateCompositeConvJSONNode(cn);
     } else if (name == "clml.batch_norm") {
       json_node = CreateBatchNormJSONNode(cn);
-    } else if (name == "clml.dense") {
+    } else if (name == "clml.dense1d" || name == "clml.dense2d") {
       json_node = CreateDenseJSONNode(cn);
     } else if (name == "clml.pad") {
       json_node = CreatePadJSONNode(cn);
diff --git a/src/runtime/contrib/clml/clml_runtime.cc 
b/src/runtime/contrib/clml/clml_runtime.cc
index 1146ff7249..aa1e2b82b6 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -512,36 +512,36 @@ class CLMLRuntime : public JSONRuntimeBase {
   /*!
    * \brief Create an CLML tensor from JSON node entry. Lookup storage map 
before creation.
    *
-   * \param tensor The tensor as Node Entry .
+   * \param nid The node index of graph JSON.
    * \param shape shape information of tensor
    * \param layout the tensor layout to be used
    * \param dtype tensor data type
    * \return CLML Tensor descriptor.
    */
   std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
-      const JSONGraphNodeEntry& tensor, std::vector<size_t> shape, 
cl_ml_tensor_layout_qcom layout,
-      cl_uint dtype) {
-    JSONGraphNode node = nodes_[tensor.id_];
+      size_t nid, std::vector<size_t> shape, cl_ml_tensor_layout_qcom layout, 
cl_uint dtype) {
+    const JSONGraphNode node = nodes_[nid];
 
-    if (this->layer_.storage_map.find(tensor.id_) == 
this->layer_.storage_map.end()) {
+    if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
       void* node_data = nullptr;
       if (node.GetOpType() == "const") {
-        node_data = data_entry_[EntryID(tensor)]->data;
+        uint32_t eid = EntryID(nid, 0);
+        node_data = data_entry_[eid]->data;
       }
       auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, 
node_data, shape);
-      this->layer_.storage_map.insert({tensor.id_, std::make_pair(clml_tensor, 
node)});
+      this->layer_.storage_map.insert({nid, std::make_pair(clml_tensor, 
node)});
 
       if ("input" == node.GetOpType()) {
-        this->layer_.inputs.insert({tensor.id_, clml_tensor});
+        this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].first});
         // Input copy placeholder Tensor
         this->layer_.in_placeholder.insert(
-            {tensor.id_, MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_NCHW_QCOM, dtype,
-                                                    node_data, shape)});
+            {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, 
dtype, node_data,
+                                             shape)});
       }
 
       return clml_tensor;
     } else {
-      return this->layer_.storage_map[tensor.id_].first;
+      return this->layer_.storage_map[nid].first;
     }
   }
 
@@ -553,76 +553,62 @@ class CLMLRuntime : public JSONRuntimeBase {
    */
   void BuildEngine() {
     size_t nid;
+    // Create tensors for the operators that have a distinct layout format
+    // other than CL_TENSOR_LAYOUT_OPTIMAL_QCOM.
+    for (nid = 0; nid < nodes_.size(); ++nid) {
+      const auto& node = nodes_[nid];
+      if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, 
node, nid);
+      if ("nn.batch_matmul" == node.GetOpName()) 
CreateBatchMatmulLayerTensor(&layer_, node, nid);
+    }
+
     for (nid = 0; nid < nodes_.size(); ++nid) {
       const auto& node = nodes_[nid];
-      DLDataType tvm_dtype = node.GetOpDataType()[0];
-      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
       if (node.GetOpType() == "input") {
         // Layers may request for different layout. Differ the input 
allocation.
       } else if (node.GetOpType() == "kernel") {
         auto op_name = node.GetOpName();
-        if ("nn.conv2d" == op_name) {
-          auto out = CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_CONVOLUTION_QCOM);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.depthwise_conv2d" == op_name) {
-          auto out = CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_DEPTHWISE_QCOM);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.conv2d_transpose" == op_name) {
-          auto out = CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_TRANSPOSE_QCOM);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.relu6" == op_name) {
-          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU6);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.relu" == op_name) {
-          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.batch_norm" == op_name) {
-          auto out = CreateBatchNormLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
-                   "nn.l2_pool2d" == op_name) {
-          auto out = CreatePoolingLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" 
== op_name) {
-          auto out = CreateGlobalPoolingLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("reshape" == op_name) {
-          auto out = CreateReshapeLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("concatenate" == op_name) {
-          auto out = CreateConcatLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.dense" == op_name) {
-          auto out = CreateDenseLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.softmax" == op_name) {
-          auto out = CreateSoftMaxLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.pad" == op_name) {
-          auto out = CreatePadLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.batch_flatten" == op_name) {
-          auto out = CreateBatchFlattenLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("clip" == op_name) {
-          auto out = CreateClipLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("add" == op_name || "subtract" == op_name || "multiply" == 
op_name ||
-                   "minimum" == op_name || "maximum" == op_name || "divide" == 
op_name) {
-          auto out = CreateBinaryLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.depth_to_space" == op_name) {
-          auto out = CreateDepthToSpaceLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.upsampling" == op_name) {
-          auto out = CreateResizeLayer(&layer_, node);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else if ("nn.batch_matmul" == op_name) {
-          auto out = CreateBatchMatmulLayer(&layer_, node, nid);
-          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
-        } else {
+        if ("nn.conv2d" == op_name)
+          CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
+        else if ("nn.depthwise_conv2d" == op_name)
+          CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
+        else if ("nn.conv2d_transpose" == op_name)
+          CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
+        else if ("nn.relu6" == op_name)
+          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
+        else if ("nn.relu" == op_name)
+          CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
+        else if ("nn.batch_norm" == op_name)
+          CreateBatchNormLayer(&layer_, node, nid);
+        else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
+                 "nn.l2_pool2d" == op_name)
+          CreatePoolingLayer(&layer_, node, nid);
+        else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" 
== op_name)
+          CreateGlobalPoolingLayer(&layer_, node, nid);
+        else if ("reshape" == op_name)
+          CreateReshapeLayer(&layer_, node, nid);
+        else if ("concatenate" == op_name)
+          CreateConcatLayer(&layer_, node, nid);
+        else if ("nn.dense" == op_name)
+          CreateDenseLayer(&layer_, node, nid);
+        else if ("nn.softmax" == op_name)
+          CreateSoftMaxLayer(&layer_, node, nid);
+        else if ("nn.pad" == op_name)
+          CreatePadLayer(&layer_, node, nid);
+        else if ("nn.batch_flatten" == op_name)
+          CreateBatchFlattenLayer(&layer_, node, nid);
+        else if ("clip" == op_name)
+          CreateClipLayer(&layer_, node, nid);
+        else if ("add" == op_name || "subtract" == op_name || "multiply" == 
op_name ||
+                 "minimum" == op_name || "maximum" == op_name || "divide" == 
op_name)
+          CreateBinaryLayer(&layer_, node, nid);
+        else if ("nn.depth_to_space" == op_name)
+          CreateDepthToSpaceLayer(&layer_, node, nid);
+        else if ("nn.upsampling" == op_name)
+          CreateResizeLayer(&layer_, node, nid);
+        else if ("nn.batch_matmul" == op_name)
+          CreateBatchMatmulLayer(&layer_, node, nid);
+        else
           LOG(FATAL) << "Unsupported op: " << op_name;
-        }
         this->layer_.layer_names.push_back(op_name);
       } else if (node.GetOpType() != "const") {
         LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
@@ -778,16 +764,20 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param mode The conv2d mode type - CL_CONVOLUTION_MODE_CONVOLUTION_QCOM
+   *                                    or CL_CONVOLUTION_MODE_DEPTHWISE_QCOM
+   *                                    or CL_CONVOLUTION_MODE_TRANSPOSE_QCOM.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConvolution2DLayer(
-      CachedLayer* layer, const JSONGraphNode& node, cl_convolution_mode_qcom 
mode) {
+  void CreateConvolution2DLayer(CachedLayer* layer, const JSONGraphNode& node,
+                                cl_convolution_mode_qcom mode, size_t nid) {
     std::vector<std::string> padding = 
node.GetAttr<std::vector<std::string>>("padding");
     std::vector<std::string> strides = 
node.GetAttr<std::vector<std::string>>("strides");
     std::vector<std::string> dilation = 
node.GetAttr<std::vector<std::string>>("dilation");
     std::vector<cl_uint> clml_padding = GetVectorValues(padding);
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
     if (!node.HasAttr("padding")) {
       clml_padding.resize(4);
       std::fill(clml_padding.begin(), clml_padding.end(), 0);
@@ -835,14 +825,15 @@ class CLMLRuntime : public JSONRuntimeBase {
     has_bn = (num_inputs == 6) || (num_inputs == 7);
     // Input
     auto input =
-        MakeCLMLTensorFromJSONEntry(inputs[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+        MakeCLMLTensorFromJSONEntry(inputs[0].id_, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     // Weight
     auto weight =
-        MakeCLMLTensorFromJSONEntry(inputs[1], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+        MakeCLMLTensorFromJSONEntry(inputs[1].id_, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     // Bias
     auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     if (has_bias) {
-      bias = MakeCLMLTensorFromJSONEntry(inputs[2], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bias =
+          MakeCLMLTensorFromJSONEntry(inputs[2].id_, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     } else {
       cl_ml_tensor_desc_qcom desc = {};
       desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -851,7 +842,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       bias->tensor = layer_.unusedTensor;
     }
     // Output
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                              groups,
                                              4,
@@ -886,13 +877,13 @@ class CLMLRuntime : public JSONRuntimeBase {
       auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
       auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
       auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape,
+      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index].id_, bn_shape,
                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape,
+      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1].id_, bn_shape,
                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape,
+      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2].id_, bn_shape,
                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape,
+      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3].id_, bn_shape,
                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
 
       cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, 
cl_arithmetic_mode};
@@ -912,7 +903,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       }
       layer->function.push_back(op);
     }
-    return output;
+    return;
   }
 
   /*!
@@ -920,18 +911,18 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateReLULayer(
-      CachedLayer* layer, const JSONGraphNode& node,
-      cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
+  void CreateReLULayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid,
+                       cl_activation_function_qcom clml_act_type = 
CL_ACTIVATION_RELU) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, 
CL_PROPAGATE_NAN_QCOM,
                                               cl_arithmetic_mode};
@@ -947,7 +938,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -956,16 +947,16 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateBatchNormLayer(CachedLayer* layer,
-                                                                      const 
JSONGraphNode& node) {
+  void CreateBatchNormLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
     int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
     float epsilon = 
std::stof(node.GetAttr<std::vector<std::string>>("epsilon")[0]);
 
@@ -981,16 +972,16 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
     auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape,
+    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, bn_shape,
                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape,
+    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, bn_shape,
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape,
+    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3].id_, bn_shape,
                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape,
+    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4].id_, bn_shape,
                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
 
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, 
cl_arithmetic_mode};
 
@@ -1000,7 +991,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1010,17 +1001,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreatePoolingLayer(CachedLayer* layer,
-                                                                    const 
JSONGraphNode& node) {
+  void CreatePoolingLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     std::vector<std::string> windows = 
node.GetAttr<std::vector<std::string>>("pool_size");
     std::vector<std::string> strides = 
node.GetAttr<std::vector<std::string>>("strides");
@@ -1053,7 +1044,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1063,17 +1054,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateGlobalPoolingLayer(
-      CachedLayer* layer, const JSONGraphNode& node) {
+  void CreateGlobalPoolingLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
     cl_ml_op_pooling_desc_qcom pool_desc = {
         node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
@@ -1098,7 +1089,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1106,19 +1097,19 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateSoftMaxLayer(CachedLayer* layer,
-                                                                    const 
JSONGraphNode& node) {
+  void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
     auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr,
-                                             {out_dims.n, out_dims.c, 1, 1});
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {out_dims.n, out_dims.c, 1, 
1},
+                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
 
     cl_ml_op_softmax_desc_qcom softmax_desc = 
{CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
                                                CL_SOFTMAX_MODE_INSTANCE_QCOM, 
cl_arithmetic_mode};
@@ -1128,7 +1119,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1136,17 +1127,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePadLayer(CachedLayer* 
layer,
-                                                                const 
JSONGraphNode& node) {
+  void CreatePadLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     std::string pad_mode = 
node.GetAttr<std::vector<std::string>>("pad_mode")[0];
     std::vector<std::string> padding = 
node.GetAttr<std::vector<std::string>>("pad_width");
@@ -1173,7 +1164,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1181,23 +1172,23 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateBatchFlattenLayer(
-      CachedLayer* layer, const JSONGraphNode& node) {
+  void CreateBatchFlattenLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, 
input->tensor, output->tensor,
                                                 &op, layer_.tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1205,23 +1196,23 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateReshapeLayer(CachedLayer* layer,
-                                                                    const 
JSONGraphNode& node) {
+  void CreateReshapeLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
 
     result = CLML_INTF->clCreateMLOpReshapeQCOM(CLML_CTX, nullptr, 
input->tensor, output->tensor,
                                                 &op, layer_.tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1230,21 +1221,21 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateConcatLayer(CachedLayer* layer,
-                                                                   const 
JSONGraphNode& node) {
+  void CreateConcatLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
     int inputSize = input_.size();
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_uint axis = 
std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
     cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
     for (int i = 0; i < inputSize; i++) {
-      auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {},
+      auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i].id_, {},
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
       concatInputs[i] = input->tensor;
     }
@@ -1257,7 +1248,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     layer->function.push_back(op);
 
     delete[] concatInputs;
-    return output;
+    return;
   }
 
   /*!
@@ -1266,40 +1257,112 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateDenseLayer(CachedLayer* 
layer,
-                                                                  const 
JSONGraphNode& node) {
+  void CreateDenseLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    size_t num_inputs = node.GetInputs().size();
+    bool has_bias = (num_inputs == 3);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
-    auto input =
-        MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
+    cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
+    bool is_vec_matmul = false;
+    if (in_dims.n == 1 && has_bias) {
+      layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
+      is_vec_matmul = true;
+    }
+
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, 
layout, cl_dtype);
     auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
-    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, 
wt_dims.n, wt_dims.c},
-                                              CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
-    cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
-    if (in_dims.c == wt_dims.c) {
-      b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, 
wt_dims.n, wt_dims.c},
+                                              layout, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+
+    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    if (has_bias) {
+      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2].id_, {}, layout, 
cl_dtype);
+    } else {
+      cl_ml_tensor_desc_qcom desc = {};
+      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+      bias->tensor = layer_.unusedTensor;
     }
-    cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n,                    // m
-                                        wt_dims.n,                    // n
-                                        wt_dims.c,                    // k
-                                        CL_GEMM_TRANSFORM_NONE_QCOM,  // A 
transform
-                                        b_transform,                  // B 
transform
-                                        {{1.0}, CL_FLOAT},            // alpha
-                                        {{0.0}, CL_FLOAT},            // beta
-                                        cl_arithmetic_mode};
 
-    result = CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, 
input->tensor, weight->tensor,
-                                             output->tensor, &op, 
layer_.tuning_cache);
-    ICHECK(op && result == CL_SUCCESS) << "Dense Error:" << result;
+    if (is_vec_matmul) {
+      cl_fc_weight_transform_qcom w_transform = 
CL_FC_WEIGHT_TRANSFORM_NONE_QCOM;
+      if (in_dims.c == wt_dims.c) w_transform = 
CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM;
 
-    layer->function.push_back(op);
-    return output;
+      cl_ml_op_fully_connected_desc_qcom fc_desc{1,  // refer clml_ops.txt for 
struct
+                                                 w_transform, 
cl_arithmetic_mode};
+
+      result = CLML_INTF->clCreateMLOpFullyConnectedQCOM(CLML_CTX, nullptr, 
&fc_desc, input->tensor,
+                                                         weight->tensor, 
bias->tensor,
+                                                         output->tensor, &op, 
layer_.tuning_cache);
+      ICHECK(op && result == CL_SUCCESS) << "FC layer Error:" << result;
+      layer->function.push_back(op);
+    } else {
+      cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;
+      if (in_dims.c == wt_dims.c) b_transform = 
CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
+
+      cl_ml_op_gemm_desc_qcom gemmDesc = {in_dims.n,                    // m
+                                          wt_dims.n,                    // n
+                                          wt_dims.c,                    // k
+                                          CL_GEMM_TRANSFORM_NONE_QCOM,  // A 
transform
+                                          b_transform,                  // B 
transform
+                                          {{1.0}, CL_FLOAT},            // 
alpha
+                                          {{0.0}, CL_FLOAT},            // beta
+                                          cl_arithmetic_mode};
+
+      result =
+          CLML_INTF->clCreateMLOpGemmQCOM(CLML_CTX, 0, &gemmDesc, 
input->tensor, weight->tensor,
+                                          output->tensor, &op, 
layer_.tuning_cache);
+      ICHECK(op && result == CL_SUCCESS) << "Gemm layer Error:" << result;
+      layer->function.push_back(op);
+      if (has_bias) {
+        cl_ml_op_binary_desc_qcom binaryDesc = {CL_TENSOR_OP_ADD_QCOM,
+                                                {{1.0}, CL_FLOAT},  // alpha
+                                                {{1.0}, CL_FLOAT},  // beta
+                                                {{1.0}, CL_FLOAT},  // gamma
+                                                cl_arithmetic_mode};
+        result = CLML_INTF->clCreateMLOpBinaryQCOM(CLML_CTX, 0, &binaryDesc, 
bias->tensor,
+                                                   layer_.unusedTensor, 
output->tensor, &op,
+                                                   layer_.tuning_cache);
+        layer->function.push_back(op);
+      }
+    }
+
+    return;
+  }
+
+  /*!
+   * \brief Create dense layer tensors with a supported layout.
+   *
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
+   */
+  void CreateDenseLayerTensor(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = nullptr;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
+    size_t num_inputs = node.GetInputs().size();
+    bool has_bias = (num_inputs == 3);
+    cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
+    if (in_dims.n == 1 && has_bias) {
+      layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
+    }
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, 
layout, cl_dtype);
+    auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, 
wt_dims.n, wt_dims.c},
+                                              layout, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+
+    return;
   }
 
   /*!
@@ -1308,20 +1371,19 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateBatchMatmulLayer(CachedLayer* layer,
-                                                                        const 
JSONGraphNode& node,
-                                                                        int 
nid) {
+  void CreateBatchMatmulLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {in_dims.c, 
in_dims.h},
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, 
{in_dims.c, in_dims.h},
                                              CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
     auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
-    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, 
wt_dims.c, wt_dims.h},
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, 
wt_dims.c, wt_dims.h},
                                               CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
 
     std::vector<int64_t> out_shape = node.GetOpShape()[0];
@@ -1330,8 +1392,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     clml_out_shape.push_back(out_shape[2]);
     clml_out_shape.push_back(1);
     clml_out_shape.push_back(1);
-    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype, nullptr,
-                                             clml_out_shape);
+    auto output =
+        MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, 
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
     layer->out_shapes.insert({nid, clml_out_shape});
 
     cl_bool b_transpose = 
std::stoi(node.GetAttr<std::vector<std::string>>("transpose_b")[0]);
@@ -1353,7 +1415,40 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "BatchMatmul Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
+  }
+
+  /*!
+   * \brief Create batch matmul layer (batch_size=1 supported) tensors with a supported layout.
+   *
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
+   */
+  void CreateBatchMatmulLayerTensor(CachedLayer* layer, const JSONGraphNode& 
node, size_t nid) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = nullptr;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, 
{in_dims.c, in_dims.h},
+                                             CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
+    auto wt_dims = GetTensorDims(nodes_[node.GetInputs()[1].id_]);
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {1, 1, 
wt_dims.c, wt_dims.h},
+                                              CL_TENSOR_LAYOUT_NCHW_QCOM, 
cl_dtype);
+
+    std::vector<int64_t> out_shape = node.GetOpShape()[0];
+    std::vector<size_t> clml_out_shape;
+    clml_out_shape.push_back(out_shape[1]);
+    clml_out_shape.push_back(out_shape[2]);
+    clml_out_shape.push_back(1);
+    clml_out_shape.push_back(1);
+    auto output =
+        MakeCLMLTensorFromJSONEntry(nid, clml_out_shape, 
CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype);
+    layer->out_shapes.insert({nid, clml_out_shape});
+    return;
   }
 
   /*!
@@ -1361,17 +1456,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateClipLayer(CachedLayer* 
layer,
-                                                                 const 
JSONGraphNode& node) {
+  void CreateClipLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_float a_max = 
std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
     cl_float a_min = 
std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
 
@@ -1383,7 +1478,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1391,19 +1486,19 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of the JSON graph node that points to this operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateBinaryLayer(CachedLayer* layer,
-                                                                   const 
JSONGraphNode& node) {
+  void CreateBinaryLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {},
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-    auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {},
+    auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1].id_, {},
                                                CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     std::string op_name = node.GetOpName();
     cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
     if (op_name == "subtract")
@@ -1425,7 +1520,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1433,17 +1528,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of the JSON graph node that points to this operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateDepthToSpaceLayer(
-      CachedLayer* layer, const JSONGraphNode& node) {
+  void CreateDepthToSpaceLayer(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_uint block_size = 
std::stoi(node.GetAttr<std::vector<std::string>>("block_size")[0]);
 
     cl_ml_op_depthtospace_desc_qcom dtos_desc = {block_size, 
cl_arithmetic_mode};
@@ -1452,7 +1547,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "DepthToSpace Layer Error:" << 
result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
@@ -1460,17 +1555,17 @@ class CLMLRuntime : public JSONRuntimeBase {
    *
    * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
    * \param node The JSON representation of the operator.
+   * \param nid The node index of the JSON graph node that points to this operator.
    */
-  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateResizeLayer(CachedLayer* layer,
-                                                                   const 
JSONGraphNode& node) {
+  void CreateResizeLayer(CachedLayer* layer, const JSONGraphNode& node, size_t 
nid) {
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
-                                             cl_dtype);
-    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, 
cl_dtype);
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     cl_bool align_corners = 
std::stoi(node.GetAttr<std::vector<std::string>>("align_corners")[0]);
 
     cl_ml_op_resize_bilinear_desc_qcom resize_desc = {align_corners, false, 
cl_arithmetic_mode};
@@ -1479,7 +1574,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     ICHECK(op && result == CL_SUCCESS) << "Resize Layer Error:" << result;
 
     layer->function.push_back(op);
-    return output;
+    return;
   }
 
   /*!
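
As a rough illustration (not part of the commit; shapes and dtype are invented for the sketch), the reworked batch-matmul path above targets relay graphs of the following form, with batch_size fixed at 1 and the transpose_b attribute read from the JSON node:

    import tvm
    from tvm import relay

    # Hypothetical batch_size=1 nn.batch_matmul of the kind the new
    # CreateBatchMatmulLayer / CreateBatchMatmulLayerTensor paths are meant to cover.
    a = relay.var("a", shape=(1, 16, 32), dtype="float16")
    b = relay.var("b", shape=(1, 8, 32), dtype="float16")  # (batch, N, K); consumed as B^T
    out = relay.nn.batch_matmul(a, b)                       # -> (1, 16, 8)
    mod = tvm.IRModule.from_expr(relay.Function([a, b], out))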
diff --git a/tests/python/contrib/test_clml/conftest.py 
b/tests/python/contrib/test_clml/conftest.py
index a51fc8edf1..6b9c91ec10 100644
--- a/tests/python/contrib/test_clml/conftest.py
+++ b/tests/python/contrib/test_clml/conftest.py
@@ -15,12 +15,25 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import sys
+import os
 import tvm
+from tvm import rpc
 import pytest
-from test_clml.infrastructure import Device
 
 
 @pytest.fixture(scope="session")
-def device():
-    return Device()
+def remote():
+    if (
+        "TVM_TRACKER_HOST" in os.environ
+        and "TVM_TRACKER_PORT" in os.environ
+        and "RPC_DEVICE_KEY" in os.environ
+    ):
+
+        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
+        rpc_tracker_port = int(os.environ["TVM_TRACKER_PORT"])
+        rpc_device_key = os.environ["RPC_DEVICE_KEY"]
+        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
+        remote = tracker.request(rpc_device_key, priority=0, 
session_timeout=600)
+        return remote
+    else:
+        return None
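
A hedged usage sketch (not part of the commit) of how a test would consume this session-scoped "remote" fixture together with the reworked infrastructure helpers; when the tracker environment variables are unset the fixture returns None and the helpers fall back to a local OpenCL device:

    import numpy as np
    import tvm
    from tvm import relay
    from test_clml.infrastructure import build_and_run

    def test_relu_example(remote):
        # Minimal relu module, mirroring how the CLML tests drive build_and_run.
        x = relay.var("x", shape=(1, 16), dtype="float32")
        mod = tvm.IRModule.from_expr(relay.nn.relu(x))
        inputs = {"x": np.random.uniform(-1, 1, (1, 16)).astype("float32")}
        out = build_and_run(
            remote, mod, {}, inputs, "opencl -device=adreno", enable_clml=True, stat_file=""
        )
        # CLML relu should match a plain numpy reference.
        np.testing.assert_allclose(out.asnumpy(), np.maximum(inputs["x"], 0), rtol=1e-5, atol=1e-5)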
diff --git a/tests/python/contrib/test_clml/infrastructure.py 
b/tests/python/contrib/test_clml/infrastructure.py
index f0a513cc17..b8ce236cdd 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -33,72 +33,23 @@ from tvm import autotvm
 from tvm.autotvm.measure import request_remote
 from tvm.relay.expr_functor import ExprMutator, Call
 
+"""Utils for adreno compute/schedules"""
 
-class Device:
-    """
-    Configuration for CLML tests.
-
-    Check tests/python/contrib/clml/ for the presence of an test_config.json 
file.
-    This file can be used to override the default configuration here which 
will attempt to run the
-    Open CLML runtime tests locally if the runtime is available. Changing the 
configuration
-    will allow these runtime tests to be offloaded to a remote Snapdragon 
device via a tracker for example.
-
-    Notes
-    -----
-        The test configuration will be loaded once when the class is created. 
If the configuration
-        changes between tests, any changes will not be picked up.
-
-    Parameters
-    ----------
-    device : RPCSession
-        Allows tests to connect to and use remote device.
-
-    Attributes
-    ----------
-    connection_type : str
-        Details the type of RPC connection to use. Options:
-        local - Use the local device,
-        tracker - Connect to a tracker to request a remote device,
-        remote - Connect to a remote device directly.
-    host : str
-        Specify IP address or hostname of remote target.
-    port : int
-        Specify port number of remote target.
-    target : str
-        The compilation target.
-    device_key : str
-        The device key of the remote target. Use when connecting to a remote 
device via a tracker.
-    cross_compile : str
-        Specify path to cross compiler to use when connecting a remote device 
from a non-arm platform.
-    """
-
-    connection_type = "tracker"
-    host = os.getenv("TVM_TRACKER_HOST", "localhost")
-    port = int(os.getenv("TVM_TRACKER_PORT", 9090))
-    target = "opencl"
-    target_host = "llvm -mtriple=aarch64-linux-gnu"
-    device_key = os.getenv("RPC_DEVICE_KEY", "android")
-    cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
-
-    def __init__(self):
-        """Keep remote device for lifetime of object."""
-        self.device = self._get_remote()
-
-    @classmethod
-    def _get_remote(cls):
-        """Get a remote (or local) device to use for testing."""
-        if cls.connection_type == "tracker":
-            device = request_remote(cls.device_key, cls.host, cls.port, 
timeout=1000)
-        elif cls.connection_type == "remote":
-            device = rpc.connect(cls.host, cls.port)
-        elif cls.connection_type == "local":
-            device = rpc.LocalSession()
-        else:
-            raise ValueError(
-                "connection_type in test_config.json should be one of: " 
"local, tracker, remote."
-            )
+import os
+import tvm
+import numpy as np
+from tvm import relay
+from tvm import autotvm
+from tvm import rpc
+from tvm.contrib import utils, ndk
+from tvm.relay import testing
+from tvm.relay.transform import recast
+from tvm.contrib import graph_runtime
+from tvm.runtime.vm import VirtualMachine
+import json
 
-        return device
+
+NDK_CROSS_COMPILER = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
 
 
 def get_cpu_op_count(mod):
@@ -139,78 +90,102 @@ def get_non_cpu_op_count(mod):
     return c.count
 
 
-def skip_codegen_test():
-    """Skip test if it requires the CLML codegen and it's not present."""
-    if not tvm.get_global_func("relay.ext.clml", True):
-        print("Skip because CLML codegen is not available.")
-        return True
-
+# Build and run a module on an OpenCL or CLML target using the graph executor.
+def build_and_run(
+    remote,
+    mod,
+    params1,
+    inputs,
+    target="llvm",
+    enable_clml=False,
+    stat_file=None,
+):
+    if remote is None:
+        target_host = "llvm"
+    else:
+        target_host = "llvm -mtriple=arm64-linux-android"
 
-def build_module(mod, target, target_host, params=None, enable_clml=True, 
tune_log=""):
-    """Build module with option to build for CLML."""
     if isinstance(mod, tvm.relay.expr.Call):
         mod = tvm.IRModule.from_expr(mod)
 
-    with autotvm.apply_history_best(tune_log):
-        with tvm.transform.PassContext(opt_level=3, 
disabled_pass=["AlterOpLayout"]):
+    with autotvm.apply_history_best(stat_file):
+        with tvm.transform.PassContext(opt_level=3):
             if enable_clml:
-                mod = clml.preprocess_module(mod)
-                mod = clml.partition_for_clml(mod, params)
-            relay.backend.te_compiler.get().clear()
-            return relay.build(mod, target=target, target_host=target_host, 
params=params)
-
+                mod = clml.partition_for_clml(mod, params1)
+            graph, lib, params = relay.build(
+                mod, target_host=target_host, target=target, params=params1
+            )
 
-def build_and_run(
-    mod, inputs, outputs, params, device, enable_clml=True, no_runs=1, 
config=None, tune_log=""
+    if remote is None:
+        ctx = tvm.opencl()
+        m = graph_runtime.create(graph, lib, ctx)
+    else:
+        temp = utils.tempdir()
+        dso_binary = "dev_lib_cl.so"
+        dso_binary_path = temp.relpath(dso_binary)
+        ctx = remote.cl(0)
+        lib.export_library(dso_binary_path, fcompile=ndk.create_shared)
+        remote.upload(dso_binary_path)
+        rlib = remote.load_module(dso_binary)
+        m = graph_runtime.create(graph, rlib, ctx)
+    m.set_input(**params)
+    m.set_input(**inputs)
+    m.run()
+    return m.get_output(0)
+
+
+# Build and run a module on an OpenCL or CLML target using the VM executor.
+def build_and_run_vm(
+    remote,
+    mod,
+    params1,
+    inputs,
+    target="llvm",
+    enable_clml=False,
+    stat_file=None,
 ):
-    """Build and run the relay module."""
-    if config is None:
-        config = {}
-
-    try:
-        libm = build_module(mod, device.target, device.target_host, params, 
enable_clml, tune_log)
-        clml_modules = extract_clml_modules(libm)
-        for mod in clml_modules:
-            source = mod.get_source("json")
-            codegen = json.loads(source)["nodes"]
-            # remove input and const names as these cannot be predetermined
-            for node in range(len(codegen)):
-                if codegen[node]["op"] == "input" or codegen[node]["op"] == 
"const":
-                    codegen[node]["name"] = ""
-            codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
-
-    except Exception as e:
-        err_msg = "The module could not be built.\n"
-        if config:
-            err_msg += f"The test failed with the following parameters: 
{config}\n"
-        err_msg += str(e)
-        raise Exception(err_msg)
-
-    lib = update_lib(libm, device.device, device.cross_compile)
-    gen_module = 
graph_executor.GraphModule(lib["default"](device.device.cl(0)))
-    gen_module.set_input(**inputs)
-    out = []
-    for _ in range(no_runs):
-        gen_module.run()
-        out.append([gen_module.get_output(i) for i in range(outputs)])
-    # time_f = gen_module.module.time_evaluator("run", device.device.cl(0), 
number=1)
-    # cost = time_f().mean
-    # print("%g secs/iteration\n" % cost)
-    return out
+    if remote is None:
+        target_host = "llvm"
+    else:
+        target_host = "llvm -mtriple=arm64-linux-android"
+
+    target_host = tvm.target.Target(target_host)
+    target = tvm.target.Target(target, target_host)
+    if isinstance(mod, relay.Function):
+        module = tvm.IRModule({})
+        module["main"] = mod
+        mod = module
+    elif isinstance(mod, tvm.relay.expr.Call):
+        mod = tvm.IRModule.from_expr(mod)
 
+    with autotvm.apply_history_best(stat_file):
+        with tvm.transform.PassContext(opt_level=3):
+            if enable_clml:
+                mod = clml.partition_for_clml(mod, params1)
+            vmc = relay.vm.compile(mod, target=target, params=params1)
 
-def update_lib(lib, device, cross_compile):
-    """Export the library to the remote/local device."""
-    lib_name = "mod.so"
-    temp = utils.tempdir()
-    lib_path = temp.relpath(lib_name)
-    if cross_compile:
-        lib.export_library(lib_path, cc=cross_compile)
+    if remote is None:
+        dev = tvm.opencl()
+        vm = VirtualMachine(vmc, dev, "naive")
     else:
-        lib.export_library(lib_path)
-    device.upload(lib_path)
-    lib = device.load_module(lib_name)
-    return lib
+        temp = utils.tempdir()
+        dso_binary = "dev_lib_cl.so"
+        dso_binary_path = temp.relpath(dso_binary)
+        dev = remote.cl(0)
+        vmc.mod.export_library(dso_binary_path, cc=NDK_CROSS_COMPILER)
+        remote.upload(dso_binary_path)
+        rlib = remote.load_module(dso_binary)
+        vm = VirtualMachine(rlib, dev, "naive")
+    inputs_data = {}
+    for key in inputs.keys():
+        inputs_data[key] = tvm.nd.array(inputs[key], dev)
+    for k, v in params1.items():
+        inputs_data[k] = tvm.nd.array(v, dev)
+    vm.set_input("main", **inputs_data)
+    vm.invoke_stateful("main")
+    out = vm.get_outputs()[0]
+
+    return out
 
 
 def extract_clml_modules(module):
@@ -219,18 +194,23 @@ def extract_clml_modules(module):
 
 
 def verify_codegen(
+    remote,
     mod,
-    known_good_codegen,
-    device,
     params,
+    known_good_codegen,
+    target="llvm",
     num_clml_modules=1,
     tvm_ops=0,
 ):
     """Check clml codegen against a known good output."""
+    if remote is None:
+        target_host = "llvm"
+    else:
+        target_host = "llvm -mtriple=arm64-linux-android"
+
     if isinstance(mod, tvm.relay.expr.Call):
         mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3, 
disabled_pass=["AlterOpLayout"]):
-        mod = clml.preprocess_module(mod)
+    with tvm.transform.PassContext(opt_level=3):
         mod = clml.partition_for_clml(mod, params)
         tvm_op_count = get_cpu_op_count(mod)
         assert tvm_op_count == tvm_ops, "Got {} TVM operators, expected 
{}".format(
@@ -246,7 +226,7 @@ def verify_codegen(
         ), "Got {} Open CLML partitions, expected {}".format(partition_count, 
num_clml_modules)
     relay.backend.te_compiler.get().clear()
 
-    module = relay.build(mod, target=device.target, 
target_host=device.target_host, params=params)
+    module = relay.build(mod, target=target, target_host=target_host, 
params=params)
     clml_modules = extract_clml_modules(module)
     assert len(clml_modules) == num_clml_modules, (
         f"The number of CLML modules produced ({len(clml_modules)}) does not "
diff --git a/tests/python/contrib/test_clml/test_network.py 
b/tests/python/contrib/test_clml/test_network.py
index 177359d9b1..ec51510920 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -21,158 +21,137 @@ import numpy as np
 from tvm import relay
 from tvm.relay import testing
 from tvm.contrib import utils
-from test_clml.infrastructure import build_and_run, Device
+from test_clml.infrastructure import build_and_run, build_and_run_vm
 import pytest
 
 
-def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, 
tvm_log=""):
+def _build_and_run_network(remote, mod, params, input_data, target, 
executor_type, tvm_log=""):
     """Helper function to build and run a network."""
 
     outputs = []
     for clml in [True, False]:
-        outputs.append(
-            build_and_run(mod, data, 1, params, device, enable_clml=clml, 
tune_log=tvm_log)[0][0]
-        )
+        if executor_type == "ge":
+            outputs.append(
+                build_and_run(
+                    remote,
+                    mod,
+                    params,
+                    input_data,
+                    target,
+                    enable_clml=clml,
+                    stat_file=tvm_log,
+                )
+            )
+        else:
+            outputs.append(
+                build_and_run_vm(
+                    remote,
+                    mod,
+                    params,
+                    input_data,
+                    target,
+                    enable_clml=clml,
+                    stat_file=tvm_log,
+                )
+            )
     return outputs
 
 
-def _get_keras_model(keras_model, inputs_dict, data):
-    """Convert Keras graph to relay."""
-    inputs = {}
-    for name, (shape, _) in inputs_dict.items():
-        inputs[keras_model.input_names[0]] = shape
-
-    from tensorflow.keras.layers import Input
-    from tensorflow.keras.models import Model
-
-    def get_bottom_top_model(model, layer_name):
-        layer = model.get_layer(layer_name)
-        bottom_input = model.layers[0].input
-        bottom_output = layer.output
-        bottom_model = Model(bottom_input, bottom_output)
-        return bottom_model
-
-    keras_model = get_bottom_top_model(keras_model, "predictions")
-    ref_output = keras_model.predict(data["input_1"].transpose(0, 2, 3, 1))
-
-    mod, params = relay.frontend.from_keras(keras_model, inputs, layout="NCHW")
-    return mod, params, ref_output
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
-@tvm.testing.requires_openclml
-def test_mobilenet(device, dtype):
-    def get_model():
-        from tensorflow.keras.applications import MobileNet
-        import tensorflow as tf
-
-        tf.keras.backend.clear_session()
-
-        mobilenet = MobileNet(
-            include_top=True, weights=None, input_shape=(224, 224, 3), 
classes=1000
+def get_network(name, batch_size, dtype="float32"):
+    """Get the symbol definition and random weight of a network
+
+    Parameters
+    ----------
+    name: str
+        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 
'inception_v3', 'mobilenet', ...
+    batch_size: int
+        batch size
+    dtype: str
+        Data type
+
+    Returns
+    -------
+    net: tvm.IRModule
+        The relay function of network definition
+    params: dict
+        The random parameters for benchmark
+    input_shape: tuple
+        The shape of input tensor
+    output_shape: tuple
+        The shape of output tensor
+    """
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if name == "mobilenet":
+        net, params = testing.mobilenet.get_workload(batch_size=batch_size, 
dtype=dtype)
+    elif name == "inception_v3":
+        input_shape = (batch_size, 3, 299, 299)
+        net, params = testing.inception_v3.get_workload(batch_size=batch_size, 
dtype=dtype)
+    elif "resnet" in name:
+        n_layer = int(name.split("-")[1])
+        net, params = testing.resnet.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
         )
-        inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")}
-
-        data = {}
-        np.random.seed(0)
-
-        for name, (shape, dtype) in inputs.items():
-            if dtype == "uint8":
-                low, high = 0, 1
-            else:
-                low, high = -1, 1
-            data[name] = np.random.uniform(low, high, shape).astype(dtype)
-
-        mod, params, ref_outputs = _get_keras_model(mobilenet, inputs, data)
-        return mod, params, inputs, data, ref_outputs
-
-    mod, params, inputs, input_data, ref_outputs = get_model()
-    outputs = _build_and_run_network(
-        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
-    )
-
-    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
-    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
-    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, 
atol=1e-5)
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
-@tvm.testing.requires_openclml
-def test_inception_v3(device, dtype):
-    def get_model():
-        from tensorflow.keras.applications import InceptionV3
-        import tensorflow as tf
-
-        tf.keras.backend.clear_session()
-
-        inceptionV3 = InceptionV3(
-            include_top=True, weights=None, input_shape=(299, 299, 3), 
classes=1000
+    elif "vgg" in name:
+        n_layer = int(name.split("-")[1])
+        net, params = testing.vgg.get_workload(
+            num_layers=n_layer, batch_size=batch_size, dtype=dtype
         )
-        inputs = {inceptionV3.input_names[0]: ((1, 3, 299, 299), "float16")}
-
-        data = {}
-        np.random.seed(0)
-        for name, (shape, dtype) in inputs.items():
-            if dtype == "uint8":
-                low, high = 0, 1
-            else:
-                low, high = -2, 1
-            data[name] = np.random.uniform(low, high, shape).astype(dtype)
-
-        mod, params, ref_outputs = _get_keras_model(inceptionV3, inputs, data)
-        return mod, params, inputs, data, ref_outputs
-
-    mod, params, inputs, input_data, ref_outputs = get_model()
-    outputs = _build_and_run_network(
-        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
-    )
-
-    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
-    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
-    tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, 
atol=1e-5)
-
-
-@pytest.mark.parametrize("dtype", ["float16"])
+    elif "densenet" in name:
+        n_layer = int(name.split("-")[1])
+        net, params = testing.densenet.get_workload(
+            densenet_size=n_layer, batch_size=batch_size, dtype=dtype
+        )
+    elif "squeezenet" in name:
+        version = name.split("_v")[1]
+        net, params = testing.squeezenet.get_workload(
+            batch_size=batch_size, version=version, dtype=dtype
+        )
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    initializer = relay.testing.init.Xavier()
+    for param_name in list(params.keys()):
+        filter_data = 
np.zeros(params[param_name].shape).astype(params[param_name].dtype)
+        if len(filter_data.shape) > 1:
+            initializer("weight", filter_data)
+        else:
+            initializer("bias", filter_data)
+        params[param_name] = tvm.nd.array(filter_data)
+
+    return net, params, {"data": (input_shape, dtype)}, output_shape
+
+
+executor_type = tvm.testing.parameter("ge", "vm")
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "name",
+    [
+        "resnet-18",
+        "resnet-34",
+        "resnet-50",
+        "inception_v3",
+        "mobilenet",
+    ],
+)
 @tvm.testing.requires_openclml
-def test_resnet50v2(device, dtype):
-    def get_model():
-        from tensorflow.keras.applications import ResNet50V2
-        import tensorflow as tf
-
-        tf.keras.backend.clear_session()
-
-        model = ResNet50V2(include_top=True, weights=None, input_shape=(224, 
224, 3), classes=1000)
-        inputs_dict = {model.input_names[0]: ((1, 3, 224, 224), "float32")}
-
-        data = {}
-        np.random.seed(0)
-
-        for name, (shape, dtype) in inputs_dict.items():
-            if dtype == "uint8":
-                low, high = 0, 1
-            else:
-                low, high = -1, 1
-            data[name] = np.random.uniform(low, high, shape).astype(dtype)
-
-        """Convert Keras graph to relay."""
-        inputs = {}
-        for name, (shape, _) in inputs_dict.items():
-            inputs[model.input_names[0]] = shape
-
-        ref_outputs = model.predict(data["input_1"].transpose(0, 2, 3, 1))
-
-        mod, params = relay.frontend.from_keras(model, inputs, layout="NCHW")
-
-        return mod, params, inputs, data, ref_outputs
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_network(remote, name, dtype, target, executor_type):
+    print("Network evaluating .. " + name + " " + dtype)
+    np.random.seed(0)
+    mod, params, inputs, _ = get_network(name, 1, dtype=dtype)
+    input_data = {}
 
-    mod, params, inputs, input_data, ref_outputs = get_model()
-    outputs = _build_and_run_network(
-        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
-    )
+    for name, (shape, dtype) in inputs.items():
+        input_data[name] = np.random.uniform(-1.0, 1.0, shape).astype(dtype)
 
+    outputs = _build_and_run_network(remote, mod, params, input_data, target, 
executor_type)
     opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
     clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
-    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, 
atol=1e-5)
+    tvm.testing.assert_allclose(opencl_sort[-5:], clml_sort[-5:], rtol=0, 
atol=0)
 
 
 if __name__ == "__main__":
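
The network-level check above compares the top-ranked class indices from the CLML and plain OpenCL builds (via argsort) rather than raw tensor values; a toy illustration of that comparison with made-up outputs:

    import numpy as np

    # Two slightly different score vectors still rank the same classes on top.
    clml_out = np.array([[0.10, 0.80, 0.05, 0.03, 0.02]])
    opencl_out = np.array([[0.11, 0.79, 0.05, 0.03, 0.02]])

    clml_top = np.argsort(clml_out).flatten()[-3:]      # indices of the 3 largest scores
    opencl_top = np.argsort(opencl_out).flatten()[-3:]
    np.testing.assert_allclose(opencl_top, clml_top, rtol=0, atol=0)  # exact index match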
diff --git a/tests/python/contrib/test_clml/test_ops.py 
b/tests/python/contrib/test_clml/test_ops.py
index e59a73a485..58365bf429 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -25,15 +25,47 @@ from tvm.ir import IRModule
 from tvm.contrib import utils
 from test_clml.infrastructure import (
     build_and_run,
-    Device,
-    skip_codegen_test,
+    build_and_run_vm,
     verify_codegen,
-    build_module,
-    get_cpu_op_count,
 )
 import pytest
 
 
+executor_type = tvm.testing.parameter("ge", "vm")
+
+
+def _build_and_run_network(remote, mod, params, input_data, target, 
executor_type, tvm_log=""):
+    """Helper function to build and run a network."""
+
+    outputs = []
+    for clml in [True, False]:
+        if executor_type == "ge":
+            outputs.append(
+                build_and_run(
+                    remote,
+                    mod,
+                    params,
+                    input_data,
+                    target,
+                    enable_clml=clml,
+                    stat_file=tvm_log,
+                )
+            )
+        else:
+            outputs.append(
+                build_and_run_vm(
+                    remote,
+                    mod,
+                    params,
+                    input_data,
+                    target,
+                    enable_clml=clml,
+                    stat_file=tvm_log,
+                )
+            )
+    return outputs
+
+
 def _get_conv_model(
     shape,
     kernel_h,
@@ -181,34 +213,36 @@ def _get_conv_expected_codegen(
     return inputs
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
-@tvm.testing.requires_openclml
-def test_conv2d(device, dtype):
-    trials = [
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
         # Normal convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False), 
False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (True, False, True), 
False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), 
False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True), 
False],
-        [2, 2, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False), 
False],
-        [2, 1, (2, 2), (1, 1), (1, 1), 7, (16, 12, 15), (False, False, True), 
False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), 
False],
-        [3, 3, (1, 1), (1, 1), (1, 1), 16, (16, 12, 15), (False, False, 
False), False],
-        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False), 
False],
-        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), 
False],
-        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), 
False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False), 
False],
+        [2, 2, (1, 1), (1, 1), (1, 1), 4, (16, 10, 10), (False, False, False), 
False],
         [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False), 
False],
-        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), 
False],
-        [3, 3, (1, 1), (2, 2), (1, 1), 16, (14, 10, 10), (False, True, True), 
False],
+        [5, 5, (1, 1), (2, 2), (1, 1), 4, (16, 10, 10), (False, False, False), 
False],
+        [5, 5, (1, 1), (1, 1), (1, 1), 4, (6, 256, 256), (True, True, True), 
False],
+        [3, 3, (0, 0), (1, 1), (1, 1), 4, (4, 512, 512), (False, True, False), 
False],
+        [3, 3, (1, 1), (1, 1), (1, 1), 8, (6, 512, 512), (False, True, False), 
False],
+        [1, 3, (0, 0), (1, 1), (1, 1), 16, (16, 20, 20), (False, False, True), 
False],
+        [3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 20, 20), (False, False, True), 
False],
+        # [3, 3, (1, 1), (1, 1), (1, 1), 128, (128, 16, 16), (False, True, 
False), False],
+        # [3, 3, (1, 1), (2, 2), (1, 1), 256, (128, 16, 16), (False, True, 
True), False],
         # Depth-wise convolution
-        [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), 
True],
-        [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), 
True],
-        [3, 3, (2, 2), (2, 2), (1, 1), 14, (14, 10, 10), (False, False, 
False), True],
-        [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, 
False), True],
-        [3, 3, (1, 1), (2, 2), (1, 1), 14, (14, 10, 10), (False, True, True), 
True],
-    ]
+        [3, 3, (1, 1), (1, 1), (1, 1), 11, (11, 20, 20), (False, False, True), 
True],
+        [5, 5, (2, 2), (1, 1), (1, 1), 32, (32, 20, 20), (False, True, False), 
True],
+        [3, 3, (2, 2), (2, 2), (1, 1), 128, (128, 8, 8), (False, False, 
False), True],
+        [5, 5, (0, 0), (1, 1), (1, 1), 64, (64, 32, 32), (False, False, 
False), True],
+        [3, 3, (1, 1), (2, 2), (1, 1), 16, (16, 256, 256), (False, True, 
True), True],
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d(remote, dtype, target, trials, executor_type):
+    np.random.seed(0)
 
-    for (
+    (
         kernel_h,
         kernel_w,
         pad,
@@ -218,43 +252,43 @@ def test_conv2d(device, dtype):
         shape,
         composite,
         is_depthwise,
-    ) in trials:
-        shape = (1, *shape)
-        if is_depthwise:
-            groups = shape[1]
-        else:
-            groups = 1
-        outputs = []
-        inputs = {
-            "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)),
-        }
+    ) = trials
 
-        func, params = _get_conv_model(
-            shape,
-            kernel_h,
-            kernel_w,
-            pad,
-            stride,
-            dilation,
-            groups,
-            dtype,
-            out_channels,
-            inputs,
-            has_pad=composite[0],
-            has_bias=composite[1],
-            has_activation=composite[2],
-        )
-        opencl_out = build_and_run(func, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(func, inputs, 1, params, device, 
enable_clml=True)[0]
+    shape = (1, *shape)
+    if is_depthwise:
+        groups = shape[1]
+    else:
+        groups = 1
+    outputs = []
+    inputs = {
+        "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)),
+    }
 
-        tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, 
atol=1e-5
-        )
-        args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, 
dtype, out_channels)
-        exp_codegen = _get_conv_expected_codegen(
-            *args, has_bias=composite[1], has_activation=composite[2]
-        )
-        verify_codegen(func, exp_codegen, device, params)
+    func, params = _get_conv_model(
+        shape,
+        kernel_h,
+        kernel_w,
+        pad,
+        stride,
+        dilation,
+        groups,
+        dtype,
+        out_channels,
+        inputs,
+        has_pad=composite[0],
+        has_bias=composite[1],
+        has_activation=composite[2],
+    )
+    outputs = _build_and_run_network(remote, func, params, inputs, target, 
executor_type)
+    out_rtol = 1e-1 if dtype == "float16" else 1e-5
+    tvm.testing.assert_allclose(
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
+    )
+    args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, 
out_channels)
+    exp_codegen = _get_conv_expected_codegen(
+        *args, has_bias=composite[1], has_activation=composite[2]
+    )
+    verify_codegen(remote, func, params, exp_codegen, target)
 
 
 def _get_conv2d_transpose_expected_codegen(
@@ -301,69 +335,75 @@ def _get_conv2d_transpose_expected_codegen(
     return exp_codegen
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
-@tvm.testing.requires_openclml
-def test_conv2d_transpose(device, dtype):
-    trials = [
-        [(1, 256, 100, 100), (256, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 
1)],
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
         [(1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1)],
         [(1, 64, 400, 400), (64, 16, 4, 4), 16, (4, 4), (2, 2), (1, 1, 1, 1)],
-    ]
-    for (dshape, kshape, channels, kernel_size, strides, padding) in trials:
-        x = relay.var("input", shape=dshape, dtype=dtype)
-        input_arr = tvm.nd.array(np.random.uniform(-1, 1, 
dshape).astype(dtype))
-        w = relay.var("wt", shape=kshape, dtype=dtype)
-        weight_arr = tvm.nd.array(np.random.uniform(-1, 1, 
kshape).astype(dtype))
-        inputs = {
-            "input": input_arr,
-        }
-        params = {
-            "wt": weight_arr,
-        }
-        y = relay.nn.conv2d_transpose(
-            x,
-            w,
-            channels=channels,
-            kernel_size=kernel_size,
-            strides=strides,
-            padding=padding,
-            kernel_layout="IOHW",
-            data_layout="NCHW",
-        )
-        func = relay.Function([x, w], y)
-        mod = IRModule.from_expr(func)
-
-        opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
-        tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, 
atol=1e-3
-        )
-
-        args = (
-            dshape,
-            kshape,
-            channels,
-            kernel_size,
-            strides,
-            padding,
-            (1, 1),
-            dtype,
-            opencl_out[0].shape,
-        )
-        exp_codegen = _get_conv2d_transpose_expected_codegen(*args)
-        verify_codegen(mod, exp_codegen, device, params)
+        [(1, 16, 32, 32), (16, 16, 3, 3), 16, (3, 3), (1, 1), (1, 1, 1, 1)],
+        # [(1, 256, 100, 100), (256, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 
1)],
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_conv2d_transpose(remote, dtype, target, trials, executor_type):
+    np.random.seed(0)
+    (dshape, kshape, channels, kernel_size, strides, padding) = trials
+    x = relay.var("input", shape=dshape, dtype=dtype)
+    input_arr = tvm.nd.array(np.random.uniform(-1, 1, dshape).astype(dtype))
+    w = relay.var("wt", shape=kshape, dtype=dtype)
+    weight_arr = tvm.nd.array(np.random.uniform(-1, 1, kshape).astype(dtype))
+    inputs = {
+        "input": input_arr,
+    }
+    params = {
+        "wt": weight_arr,
+    }
+    y = relay.nn.conv2d_transpose(
+        x,
+        w,
+        channels=channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        kernel_layout="IOHW",
+        data_layout="NCHW",
+    )
+    func = relay.Function([x, w], y)
+    mod = IRModule.from_expr(func)
+    outputs = _build_and_run_network(remote, mod, params, inputs, target, 
executor_type)
+    out_rtol = 1e-1 if dtype == "float16" else 1e-5
+    tvm.testing.assert_allclose(
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
+    )
+    args = (
+        dshape,
+        kshape,
+        channels,
+        kernel_size,
+        strides,
+        padding,
+        (1, 1),
+        dtype,
+        outputs[0].shape,
+    )
+    exp_codegen = _get_conv2d_transpose_expected_codegen(*args)
+    verify_codegen(remote, mod, params, exp_codegen, target)
 
 
-@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize("trials", [[1, 64, 8, 8], [1, 16, 64, 64]])
 @tvm.testing.requires_openclml
-def test_batchnorm(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_batchnorm(remote, dtype, target, trials, executor_type):
     if clml.clml_sdk_version() < 3:
         print("Skip due to unsupported CLML version:", clml.clml_sdk_version())
         return
-    in_shape = (1, 8, 64, 64)
-    channels = 8
+    in_shape = trials
+    channels = in_shape[1]
 
-    np.random.seed(8)
+    np.random.seed(0)
 
     input_arr = tvm.nd.array(np.random.uniform(-1, 1, in_shape).astype(dtype))
     inp = relay.var("a", shape=in_shape, dtype=dtype)
@@ -381,24 +421,58 @@ def test_batchnorm(device, dtype):
 
     func = relay.nn.batch_norm(inp, gamma, beta, mean, variance, axis=1, 
epsilon=0.0003)[0]
     mod = IRModule.from_expr(func)
-
     inputs = {
         "a": input_arr,
     }
-
-    opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-    clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
-
+    outputs = _build_and_run_network(remote, mod, params, inputs, target, 
executor_type)
+    out_rtol = 1e-3 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
     )
+    exp_codegen = [
+        {
+            "attrs": {"dtype": [[dtype]], "shape": 
[[list(inputs["a"].shape)]]},
+            "name": "",
+            "op": "input",
+        },
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {
+            "attrs": {
+                "axis": [["1"]],
+                "center": [["1"]],
+                "dtype": [[dtype]],
+                "epsilon": [["0.00029999999999999997"]],
+                "num_inputs": "5",
+                "num_outputs": "1",
+                "scale": [["1"]],
+                "shape": [[list(outputs[0].shape)]],
+            },
+            "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]],
+            "name": "nn.batch_norm",
+            "op": "kernel",
+        },
+    ]
+    verify_codegen(remote, mod, params, exp_codegen, target)
 
 
-@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(1, 64, 64, 40), (1, 64, 64, 40)],
+        [(1, 1280, 32, 32), (1, 640, 32, 32)],
+        [(1, 64), (1, 32)],
+    ],
+)
 @tvm.testing.requires_openclml
-def test_concat(device, dtype):
-    in_shape_1 = (1, 16, 16, 16)
-    in_shape_2 = (1, 16, 16, 16)
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_concat(remote, dtype, target, trials, executor_type):
+    np.random.seed(0)
+    in_shape_1 = trials[0]
+    in_shape_2 = trials[1]
     a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
     b = relay.var("input_2", shape=in_shape_2, dtype=dtype)
     low, high = -1, 1
@@ -409,14 +483,13 @@ def test_concat(device, dtype):
 
     params = {}
     func = relay.concatenate((a, b), axis=1)
-    mod = IRModule.from_expr(func)
-
-    opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-    clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
 
+    outputs = _build_and_run_network(remote, func, params, inputs, target, 
executor_type)
+    out_rtol = 1e-2 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
     )
+
     exp_codegen = [
         {
             "attrs": {
@@ -440,14 +513,14 @@ def test_concat(device, dtype):
                 "dtype": [[dtype]],
                 "num_inputs": "2",
                 "num_outputs": "1",
-                "shape": [[list(clml_out[0].shape)]],
+                "shape": [[list(outputs[0].shape)]],
             },
             "inputs": [[0, 0, 0], [1, 0, 0]],
             "name": "concatenate",
             "op": "kernel",
         },
     ]
-    verify_codegen(func, exp_codegen, device, params)
+    verify_codegen(remote, func, params, exp_codegen, target)
 
 
 def _get_pool_expected_codegen(input_shape, pool_size, stride, padding, 
pool_type, dtype):
@@ -488,10 +561,10 @@ def _get_pool_expected_codegen(input_shape, pool_size, 
stride, padding, pool_typ
     return exp_codegen
 
 
-@pytest.mark.parametrize("dtype", ["float16"])
-@tvm.testing.requires_openclml
-def test_pool(device, dtype):
-    trials = [
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
         # input size         pool_size stride  padding
         [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
         [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
@@ -503,42 +576,59 @@ def test_pool(device, dtype):
         [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
         [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
         [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
-    ]
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_pool(remote, dtype, target, trials, executor_type):
+    np.random.seed(0)
     params = {}
-    for (
+    (
         input_shape,
         pool_size,
         stride,
         padding,
         pooling_type,
-    ) in trials:
-        a = relay.var("input_1", shape=input_shape, dtype=dtype)
-        input_arr = tvm.nd.array(np.random.uniform(-1, 1, 
input_shape).astype(dtype))
-        inputs = {
-            "input_1": input_arr,
-        }
-
-        if pooling_type == "max":
-            func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, 
padding=padding)
-        else:
-            func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, 
padding=padding)
-        mod = IRModule.from_expr(func)
-
-        opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
-        tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, 
atol=1e-3
-        )
+    ) = trials
+    a = relay.var("input_1", shape=input_shape, dtype=dtype)
+    input_arr = tvm.nd.array(np.random.uniform(-1, 1, 
input_shape).astype(dtype))
+    inputs = {
+        "input_1": input_arr,
+    }
+    if pooling_type == "max":
+        func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, 
padding=padding)
+    else:
+        func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, 
padding=padding)
 
-        args = (input_shape, pool_size, stride, padding, pooling_type, dtype)
-        exp_codegen = _get_pool_expected_codegen(*args)
-        verify_codegen(func, exp_codegen, device, params)
+    outputs = _build_and_run_network(remote, func, params, inputs, target, 
executor_type)
+    out_rtol = 1e-2 if dtype == "float16" else 1e-5
+    tvm.testing.assert_allclose(
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
+    )
+    args = (input_shape, pool_size, stride, padding, pooling_type, dtype)
+    exp_codegen = _get_pool_expected_codegen(*args)
+    verify_codegen(remote, func, params, exp_codegen, target)
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(5, 16), (32, 16), False],
+        [(320, 64), (320, 64), False],
+        [(256, 256), (256, 256), False],
+        [(512, 512), (512, 512), False],
+        [(1, 256), (100, 256), False],
+        [(1, 16), (32, 16), True],
+        [(1, 512), (512, 512), True],
+        [(1, 5), (4, 5), True],
+    ],
+)
 @tvm.testing.requires_openclml
-def test_dense(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_dense(remote, dtype, target, trials, executor_type):
     def _get_model(x_shape, k_shape, has_bias=False):
+        np.random.seed(0)
         x = relay.var("x", shape=(x_shape), dtype=dtype)
         kernel = relay.var("kernel", shape=(k_shape), dtype=dtype)
         out = relay.nn.dense(x, kernel, units=k_shape[0])
@@ -562,22 +652,8 @@ def test_dense(device, dtype):
                 "op": "const",
             },
         ]
-
-        dense_node = {
-            "attrs": {
-                "num_inputs": "2",
-                "num_outputs": "1",
-                "dtype": [[dtype]],
-                "out_dtype": [[""]],
-                "shape": [[[x_shape[0], k_shape[0]]]],
-                "units": [[str(k_shape[0])]],
-            },
-            "inputs": [[0, 0, 0], [1, 0, 0]],
-            "name": "nn.dense",
-            "op": "kernel",
-        }
-        exp_codegen.append(dense_node)
-
+        input_nodes = [[0, 0, 0], [1, 0, 0]]
+        num_inputs = 2
         if has_bias:
             bias = relay.var("bias", shape=(k_shape[0],), dtype=dtype)
             out = relay.nn.bias_add(out, bias)
@@ -590,43 +666,48 @@ def test_dense(device, dtype):
                 "op": "const",
             }
             exp_codegen.append(bias_data_node)
-            bias_node = {
-                "attrs": {
-                    "num_inputs": "2",
-                    "num_outputs": "1",
-                    "dtype": [[dtype]],
-                    "shape": [[[x_shape[0], k_shape[0]]]],
-                },
-                "inputs": [[2, 0, 0], [3, 0, 0]],
-                "name": "add",
-                "op": "kernel",
-            }
-            exp_codegen.append(bias_node)
-
+            input_nodes.append([2, 0, 0])
+            num_inputs += 1
             params["bias"] = tvm.nd.array(np.random.uniform(-1, 1, 
(k_shape[0],)).astype(dtype))
 
+        dense_node = {
+            "attrs": {
+                "num_inputs": str(num_inputs),
+                "num_outputs": "1",
+                "dtype": [[dtype]],
+                "out_dtype": [[""]],
+                "shape": [[[x_shape[0], k_shape[0]]]],
+                "units": [[str(k_shape[0])]],
+            },
+            "inputs": input_nodes,
+            "name": "nn.dense",
+            "op": "kernel",
+        }
+        exp_codegen.append(dense_node)
+
         return out, params, inputs, exp_codegen
 
     def _verify(out, params, inputs, exp_codegen):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, 
executor_type)
+        out_rtol = 1e-1 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-2, 
atol=1e-2
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
         )
-        verify_codegen(out, exp_codegen, device, params)
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
-    _verify(*(_get_model((5, 16), (32, 16), False)))
-    _verify(*(_get_model((1, 16), (32, 16), True)))
+    _verify(*(_get_model(trials[0], trials[1], trials[2])))
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
 @tvm.testing.requires_openclml
-def test_binary_ops(device, dtype):
-    def _get_model(a_shape, b_shape, op):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_binary_ops(remote, dtype, target, executor_type):
+    def _get_model(a_shape, b_shape, op_func):
+        np.random.seed(0)
         a = relay.var("a", shape=(a_shape), dtype=dtype)
         b = relay.var("b", shape=(b_shape), dtype=dtype)
-        out = op(a, b)
+        out = op_func(a, b)
         inputs = {
             "a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype)),
             "b": tvm.nd.array(np.random.uniform(-1, 1, b_shape).astype(dtype)),
@@ -636,32 +717,56 @@ def test_binary_ops(device, dtype):
 
     def _verify(out, params, inputs):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, 
executor_type)
+        out_rtol = 1e-2 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, 
atol=1e-3
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
         )
-
-        # Check to make sure these ops are offloaded to CLML instead of TVM.
-        with tvm.transform.PassContext(opt_level=3, 
disabled_pass=["AlterOpLayout"]):
-            mod = clml.partition_for_clml(mod, params)
-            tvm_op_count = get_cpu_op_count(mod)
-            assert tvm_op_count == 0, "Got {} TVM Native Compute partitions, 
expected 0".format(
-                tvm_op_count
-            )
+        exp_codegen = [
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[list(inputs["a"].shape)]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[list(inputs["b"].shape)]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "num_inputs": "2",
+                    "num_outputs": "1",
+                    "shape": [[list(outputs[0].shape)]],
+                },
+                "inputs": [[0, 0, 0], [1, 0, 0]],
+                "name": str(out.op.name),
+                "op": "kernel",
+            },
+        ]
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
     _verify(*(_get_model((1, 16), (1, 16), relay.add)))
-    _verify(*(_get_model((1, 16), (1, 16), relay.subtract)))
-    _verify(*(_get_model((1, 16), (1, 16), relay.multiply)))
-    _verify(*(_get_model((1, 16), (1, 16), relay.divide)))
+    _verify(*(_get_model((1, 18), (1, 18), relay.subtract)))
+    _verify(*(_get_model((1, 256), (1, 256), relay.multiply)))
+    _verify(*(_get_model((1, 10), (1, 10), relay.divide)))
     _verify(*(_get_model((1, 16), (1, 16), relay.minimum)))
-    _verify(*(_get_model((1, 16), (1, 16), relay.maximum)))
+    _verify(*(_get_model((1, 512), (1, 512), relay.maximum)))
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
 @tvm.testing.requires_openclml
-def test_unary_ops(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_unary_ops(remote, dtype, target, executor_type):
     def _get_model(a_shape, op):
+        np.random.seed(0)
         a = relay.var("a", shape=(a_shape), dtype=dtype)
         out = op(a)
         inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, 
a_shape).astype(dtype))}
@@ -670,28 +775,45 @@ def test_unary_ops(device, dtype):
 
     def _verify(out, params, inputs):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, 
enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, 
executor_type)
+        out_rtol = 1e-2 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, 
atol=1e-3
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, 
atol=out_rtol
         )
 
-        # Check to make sure these ops are offloaded to CLML instead of TVM.
-        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-            mod = clml.partition_for_clml(mod, params)
-            tvm_op_count = get_cpu_op_count(mod)
-            assert tvm_op_count == 0, "Got {} TVM Native Compute partitions, expected 0".format(
-                tvm_op_count
-            )
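+        # Expected CLML codegen JSON: one input node feeding a single nn.relu kernel node.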
+        exp_codegen = [
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[list(inputs["a"].shape)]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[list(outputs[0].shape)]],
+                },
+                "inputs": [[0, 0, 0]],
+                "name": "nn.relu",
+                "op": "kernel",
+            },
+        ]
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
-    _verify(*(_get_model((1, 16), relay.nn.softmax)))
     _verify(*(_get_model((1, 16), relay.nn.relu)))
+    _verify(*(_get_model((1, 256), relay.nn.relu)))
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float16"])
 @tvm.testing.requires_openclml
-def test_depth_to_space(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_depth_to_space(remote, dtype, target, executor_type):
     def _get_model(a_shape, block_size):
+        np.random.seed(0)
         a = relay.var("a", shape=(a_shape), dtype=dtype)
         out = relay.nn.depth_to_space(a, block_size)
         inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, 
a_shape).astype(dtype))}
@@ -700,10 +822,10 @@ def test_depth_to_space(device, dtype):
 
     def _verify(out, params, inputs):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-2 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
         )
 
         # Check to make sure these ops are offloaded to CLML instead of TVM.
@@ -724,23 +846,26 @@ def test_depth_to_space(device, dtype):
                     "dtype": [[dtype]],
                     "num_inputs": "1",
                     "num_outputs": "1",
-                    "shape": [[list(clml_out[0].shape)]],
+                    "shape": [[list(outputs[0].shape)]],
                 },
                 "inputs": [[0, 0, 0]],
                 "name": "nn.depth_to_space",
                 "op": "kernel",
             },
         ]
-        verify_codegen(out, exp_codegen, device, params)
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
     _verify(*(_get_model((1, 64, 8, 8), 4)))
     _verify(*(_get_model((1, 64, 8, 8), 8)))
+    _verify(*(_get_model((1, 512, 8, 8), 8)))
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float16"])
 @tvm.testing.requires_openclml
-def test_resize_bilinear(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_resize_bilinear(remote, dtype, target, executor_type):
     def _get_model(a_shape, scale, align_corners):
+        np.random.seed(0)
         a = relay.var("a", shape=(a_shape), dtype=dtype)
         out = relay.nn.upsampling(
            a, scale_h=scale[0], scale_w=scale[1], method="bilinear", align_corners=align_corners
@@ -751,10 +876,10 @@ def test_resize_bilinear(device, dtype):
 
     def _verify(out, params, inputs):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-2 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
         )
 
         # Check to make sure these ops are offloaded to CLML instead of TVM.
@@ -777,23 +902,35 @@ def test_resize_bilinear(device, dtype):
                     "dtype": [[dtype]],
                     "num_inputs": "1",
                     "num_outputs": "1",
-                    "shape": [[list(clml_out[0].shape)]],
+                    "shape": [[list(outputs[0].shape)]],
                 },
                 "inputs": [[0, 0, 0]],
                 "name": "nn.upsampling",
                 "op": "kernel",
             },
         ]
-        verify_codegen(out, exp_codegen, device, params)
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
     _verify(*(_get_model((1, 16, 8, 8), (2, 2), False)))
     _verify(*(_get_model((1, 16, 7, 7), (2, 2), True)))
+    _verify(*(_get_model((1, 64, 8, 8), (2, 2), True)))
 
 
-@pytest.mark.parametrize("dtype", ["float32"])
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(1, 512, 32), (1, 512, 32), False, True],
+        [(1, 128, 32), (1, 128, 32), False, True],
+        [(1, 128, 128), (1, 32, 128), False, True],
+        [(1, 64, 40), (1, 64, 40), False, True],
+    ],
+)
 @tvm.testing.requires_openclml
-def test_batch_matmul(device, dtype):
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_batch_matmul(remote, dtype, target, executor_type, trials):
     def _get_model(a_shape, b_shape, a_transpose, b_transpose):
+        np.random.seed(0)
         a = relay.var("a", shape=(a_shape), dtype=dtype)
         b = relay.var("b", shape=(b_shape), dtype=dtype)
+        out = relay.nn.batch_matmul(a, b, transpose_a=a_transpose, transpose_b=b_transpose)
@@ -806,10 +943,10 @@ def test_batch_matmul(device, dtype):
 
     def _verify(out, params, inputs):
         mod = IRModule.from_expr(out)
-        opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
-        clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-1 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
         )
 
         # Check to make sure these ops are offloaded to CLML instead of TVM.
@@ -838,17 +975,320 @@ def test_batch_matmul(device, dtype):
                     "dtype": [[dtype]],
                     "num_inputs": "2",
                     "num_outputs": "1",
-                    "shape": [[list(clml_out[0].shape)]],
+                    "shape": [[list(outputs[0].shape)]],
                 },
                 "inputs": [[0, 0, 0], [1, 0, 0]],
                 "name": "nn.batch_matmul",
                 "op": "kernel",
             },
         ]
-        verify_codegen(out, exp_codegen, device, params)
+        verify_codegen(remote, mod, params, exp_codegen, target)
+
+    _verify(*(_get_model(trials[0], trials[1], trials[2], trials[3])))
+
+
+def _get_softmax_exp_codegen(inputs, dtype, output_shape, axis):
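+    """Build the expected CLML codegen JSON for softmax: one input node and one nn.softmax kernel node."""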
+
+    exp_codegen = [
+        {
+            "attrs": {
+                "dtype": [[dtype]],
+                "shape": [[list(inputs["a"].shape)]],
+            },
+            "name": "",
+            "op": "input",
+        },
+        {
+            "attrs": {
+                "axis": [[str(axis)]],
+                "dtype": [[dtype]],
+                "num_inputs": "1",
+                "num_outputs": "1",
+                "shape": [[list(output_shape)]],
+            },
+            "inputs": [[0, 0, 0]],
+            "name": "nn.softmax",
+            "op": "kernel",
+        },
+    ]
+    return exp_codegen
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_softmax(remote, dtype, target, executor_type):
+    def _get_model(a_shape, axis):
+        np.random.seed(0)
+        a = relay.var("a", shape=(a_shape), dtype=dtype)
+        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, 
a_shape).astype(dtype))}
+        out = relay.nn.softmax(a, axis)
+        params = {}
+        return out, params, inputs, axis
+
+    def _verify(out, params, inputs, axis):
+        mod = IRModule.from_expr(out)
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-1 if dtype == "float16" else 1e-5
+        tvm.testing.assert_allclose(
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        )
+        args = (inputs, dtype, outputs[0].shape, axis)
+        exp_codegen = _get_softmax_exp_codegen(*args)
+        verify_codegen(remote, mod, params, exp_codegen, target)
+
+    _verify(*(_get_model((1, 5), 1)))
+    _verify(*(_get_model((1, 1000), 1)))
+    _verify(*(_get_model((1, 3), 1)))
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(1, 1, 2, 2), 2, 1],
+        [(1, 16, 2, 2), 4, 4],
+        [(1, 8, 4, 4), 3, 2],
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_upsampling(remote, dtype, target, executor_type, trials):
+    def _verify(in_shape, scale_h, scale_w):
+        np.random.seed(0)
+        a = relay.var("a", shape=in_shape, dtype=dtype)
+        inputs = {
+            "a": tvm.nd.array(np.random.uniform(-1, 1, 
in_shape).astype(dtype)),
+        }
+        params = {}
+        func = relay.nn.upsampling(
+            a, scale_h, scale_w, layout="NCHW", method="bilinear", align_corners=False
+        )
+        mod = IRModule.from_expr(func)
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        tvm.testing.assert_allclose(
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        )
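+        # Expected CLML codegen JSON: one input node and one nn.upsampling kernel node.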
+        exp_codegen = [
+            {
+                "attrs": {"dtype": [[dtype]], "shape": 
[[list(inputs["a"].shape)]]},
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "align_corners": [["0"]],
+                    "dtype": [[dtype]],
+                    "layout": [["NCHW"]],
+                    "method": [["bilinear"]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "scale_h": [[str(scale_h)]],
+                    "scale_w": [[str(scale_w)]],
+                    "shape": [[list(outputs[0].shape)]],
+                },
+                "inputs": [[0, 0, 0]],
+                "name": "nn.upsampling",
+                "op": "kernel",
+            },
+        ]
+        verify_codegen(remote, mod, params, exp_codegen, target)
+
+    _verify(trials[0], trials[1], trials[2])
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(1, 40, 64, 64), (1, 40, 4096)],
+        [(1, 77, 768), (1, 1, -1, 768)],
+        [(1, 80, 32, 32), (1, 80, 1024)],
+        [(1, 2, 3, 4), (1, 0, -1)],
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_reshape(remote, dtype, target, executor_type, trials):
+    def _verify(shape, newshape):
+        np.random.seed(0)
+        x = relay.var("x", shape=(shape), dtype=dtype)
+        # Define the test case with a unary operator in front of the reshape:
+        # a single reshape op fails in native OpenCL with the VM executor type,
+        # since an empty TVM mod in the VM doesn't pick the appropriate cross compiler.
+        out = relay.nn.relu(x)
+        out = relay.reshape(out, newshape)
+
+        inputs = {"x": tvm.nd.array(np.random.uniform(-1, 1, 
shape).astype(dtype))}
+        params = {}
+        mod = IRModule.from_expr(out)
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-3 if dtype == "float16" else 1e-5
+        tvm.testing.assert_allclose(
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        )
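+        # Expected CLML codegen JSON: input -> nn.relu kernel -> reshape kernel.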
+        exp_codegen = [
+            {
+                "attrs": {"dtype": [[dtype]], "shape": 
[[list(inputs["x"].shape)]]},
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[list(inputs["x"].shape)]],
+                },
+                "inputs": [[0, 0, 0]],
+                "name": "nn.relu",
+                "op": "kernel",
+            },
+            {
+                "attrs": {
+                    "allowzero": [["0"]],
+                    "dtype": [[dtype]],
+                    "newshape": [[str(ele) for ele in list(newshape)]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[list(outputs[0].shape)]],
+                },
+                "inputs": [[1, 0, 0]],
+                "name": "reshape",
+                "op": "kernel",
+            },
+        ]
+        verify_codegen(remote, mod, params, exp_codegen, target)
+
+    _verify(trials[0], trials[1])
+
+
+def _get_pool_global_expected_codegen(input_shape, pool_type, dtype, out_shape):
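+    """Build the expected CLML codegen JSON for global pooling: one input node and one global avg/max pool kernel node."""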
+
+    exp_codegen = [
+        {
+            "attrs": {
+                "dtype": [[str(dtype)]],
+                "shape": [[list(input_shape)]],
+            },
+            "name": "",
+            "op": "input",
+        },
+        {
+            "attrs": {
+                "dtype": [[str(dtype)]],
+                "layout": [["NCHW"]],
+                "num_inputs": "1",
+                "num_outputs": "1",
+                "out_layout": [[""]],
+                "shape": [[list(out_shape)]],
+            },
+            "inputs": [[0, 0, 0]],
+            "name": "nn.global_avg_pool2d" if pool_type == "avg" else 
"nn.global_max_pool2d",
+            "op": "kernel",
+        },
+    ]
+    return exp_codegen
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@pytest.mark.parametrize(
+    "trials",
+    [
+        [(1, 3, 32, 32), "avg"],
+        [(1, 64, 147, 147), "max"],
+        [(1, 192, 71, 71), "max"],
+        [(1, 288, 35, 35), "max"],
+        [(1, 768, 17, 17), "max"],
+        [(1, 2048, 17, 17), "max"],
+        [(1, 192, 35, 35), "avg"],
+        [(1, 256, 35, 35), "avg"],
+        [(1, 288, 35, 35), "avg"],
+        [(1, 768, 17, 17), "avg"],
+        [(1, 1280, 8, 8), "avg"],
+    ],
+)
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_pool_global(remote, dtype, target, executor_type, trials):
+    params = {}
+    (input_shape, pooling_type) = trials
+    np.random.seed(0)
+    a = relay.var("a", shape=input_shape, dtype=dtype)
+    inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, 
input_shape).astype(dtype))}
+    if pooling_type == "max":
+        func = relay.nn.global_max_pool2d(a)
+    else:
+        func = relay.nn.global_avg_pool2d(a)
+    mod = IRModule.from_expr(func)
+    outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+    out_rtol = 1e-3 if dtype == "float16" else 1e-5
+    tvm.testing.assert_allclose(
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+    )
+    args = (input_shape, pooling_type, dtype, outputs[0].shape)
+    exp_codegen = _get_pool_global_expected_codegen(*args)
+    verify_codegen(remote, mod, params, exp_codegen, target)
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float16"])
+@tvm.testing.requires_openclml
+@tvm.testing.parametrize_targets("opencl -device=adreno")
+def test_batch_flatten(remote, dtype, target, executor_type):
+    def _get_model(a_shape):
+        a = relay.var("a", shape=(a_shape), dtype=dtype)
+        # Define the test case with a unary operator in front of the batch_flatten:
+        # a single batch_flatten op fails in native OpenCL,
+        # since an empty TVM mod in the VM doesn't pick the appropriate cross compiler.
+        out = relay.nn.relu(a)
+        out = relay.nn.batch_flatten(out)
+        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, 
a_shape).astype(dtype))}
+        params = {}
+        return out, params, inputs
+
+    def _verify(out, params, inputs):
+        mod = IRModule.from_expr(out)
+        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
+        out_rtol = 1e-3 if dtype == "float16" else 1e-5
+        tvm.testing.assert_allclose(
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        )
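+        # Expected CLML codegen JSON: input -> nn.relu kernel -> nn.batch_flatten kernel.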
+        exp_codegen = [
+            {
+                "attrs": {"dtype": [[dtype]], "shape": 
[[list(inputs["a"].shape)]]},
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[list(inputs["a"].shape)]],
+                },
+                "inputs": [[0, 0, 0]],
+                "name": "nn.relu",
+                "op": "kernel",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[list(outputs[0].shape)]],
+                },
+                "inputs": [[1, 0, 0]],
+                "name": "nn.batch_flatten",
+                "op": "kernel",
+            },
+        ]
+        verify_codegen(remote, mod, params, exp_codegen, target)
 
-    _verify(*(_get_model((1, 128, 32), (1, 128, 32), False, True)))
-    _verify(*(_get_model((1, 128, 128), (1, 32, 128), False, True)))
+    _verify(*(_get_model((1, 3, 2))))
+    _verify(*(_get_model((1, 4, 3, 2))))
+    _verify(*(_get_model((1, 64, 8, 8))))
+    _verify(*(_get_model((1, 128, 4, 4))))
 
 
 if __name__ == "__main__":
diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh
index 634d9adbd6..18e0feb815 100755
--- a/tests/scripts/task_python_adreno.sh
+++ b/tests/scripts/task_python_adreno.sh
@@ -31,6 +31,7 @@ export TVM_TRACKER_PORT=$(((RANDOM % 100) + 9100))
 export RPC_DEVICE_KEY="android"
 export RPC_TARGET="adreno"
export TVM_NDK_CC="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang"
+export CXX="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang"
 
env PYTHONPATH=python python3 -m tvm.exec.rpc_tracker --host "${TVM_TRACKER_HOST}" --port "${TVM_TRACKER_PORT}" &
 TRACKER_PID=$!
