This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 27b0aad5a5 [BYOC-OpenCLML] OpenCLML integration with TVM. (#10243)
27b0aad5a5 is described below

commit 27b0aad5a55254815a076dbcacb53e9725019f9d
Author: Siva <[email protected]>
AuthorDate: Tue Jun 14 16:00:28 2022 +0530

    [BYOC-OpenCLML] OpenCLML integration with TVM. (#10243)
    
    * [BYOC-OpenCLML] OpenCLML integration with TVM.
    
    * [BYOC-OpenCLML] Cleanup and review.
---
 CMakeLists.txt                                     |    3 +
 cmake/config.cmake                                 |    5 +
 cmake/modules/LibInfo.cmake                        |    2 +
 cmake/modules/contrib/CLML.cmake                   |   58 ++
 python/tvm/relay/op/contrib/__init__.py            |    1 +
 python/tvm/relay/op/contrib/clml.py                |  247 +++++
 src/relay/backend/contrib/clml/codegen.cc          |  412 ++++++++
 src/runtime/contrib/clml/clml_runtime.cc           | 1091 ++++++++++++++++++++
 src/support/libinfo.cc                             |    2 +
 .../python/contrib/test_clml}/__init__.py          |   13 +-
 tests/python/contrib/test_clml/infrastructure.py   |  256 +++++
 tests/python/contrib/test_clml/test_network.py     |  139 +++
 tests/python/contrib/test_clml/test_ops.py         |  216 ++++
 13 files changed, 2433 insertions(+), 12 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2238665644..6931b40c66 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,8 @@ tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT 
runtime" OFF)
 tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC, 
DYNAMIC, or OFF" OFF)
 tvm_option(USE_VITIS_AI "Build with VITIS-AI Codegen support" OFF)
 tvm_option(SUMMARIZE "Print CMake option summary after configuring" OFF)
+tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
+tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
 
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
@@ -492,6 +494,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
 include(cmake/modules/contrib/VitisAI.cmake)
 include(cmake/modules/contrib/Verilator.cmake)
+include(cmake/modules/contrib/CLML.cmake)
 include(cmake/modules/Git.cmake)
 include(cmake/modules/LibInfo.cmake)
 include(cmake/modules/RustExt.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 2c22d2b498..212b565f25 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -269,6 +269,11 @@ set(USE_VITIS_AI OFF)
 # Build Verilator codegen and runtime
 set(USE_VERILATOR OFF)
 
+# Whether to use CLML codegen
+set(USE_CLML OFF)
+# USE_CLML_GRAPH_EXECUTOR - CLML SDK PATH or ON or OFF
+set(USE_CLML_GRAPH_EXECUTOR OFF)
+
 # Build ANTLR parser for Relay text format
 # Possible values:
 # - ON: enable ANTLR by searching default locations (cmake find_program for 
antlr4 and /usr/local for jar)
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 2c07a94ed5..06c42494a3 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -113,6 +113,8 @@ function(add_lib_info src_file)
     TVM_INFO_USE_THRUST="${USE_THRUST}"
     TVM_INFO_USE_VITIS_AI="${USE_VITIS_AI}"
     TVM_INFO_USE_VULKAN="${USE_VULKAN}"
+    TVM_INFO_USE_CLML="${USE_CLML}"
+    TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
   )
 
 endfunction()
diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake
new file mode 100644
index 0000000000..30e60423b0
--- /dev/null
+++ b/cmake/modules/contrib/CLML.cmake
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_CLML)
+    file(GLOB CLML_RELAY_CONTRIB_SRC src/relay/backend/contrib/clml/*.cc)
+    file(GLOB CLML_RUNTIME_MODULE src/runtime/contrib/clml/clml_runtime.cc)
+    list(APPEND COMPILER_SRCS ${CLML_RELAY_CONTRIB_SRC})
+    if(NOT USE_CLML_GRAPH_EXECUTOR)
+        list(APPEND COMPILER_SRCS ${CLML_RUNTIME_MODULE})
+    endif()
+    message(STATUS "Build with CLML support...")
+endif()
+
+if(USE_CLML_GRAPH_EXECUTOR)
+    set(CLML_PATH ${CMAKE_CURRENT_SOURCE_DIR}/clml)
+    # Detect custom CLML path.
+    if (NOT USE_CLML_GRAPH_EXECUTOR STREQUAL "ON")
+        set(CLML_PATH ${USE_CLML_GRAPH_EXECUTOR})
+    endif()
+
+    file(GLOB CLML_CONTRIB_SRC src/runtime/contrib/clml/*)
+
+    # Cmake needs to find clml library, include and support directories
+    # in the path specified by CLML_PATH.
+    set(CLML_INCLUDE_DIRS ${CLML_PATH}/include ${CLML_PATH})
+    include_directories(${CLML_INCLUDE_DIRS})
+    find_library(EXTERN_CLML_COMPUTE_LIB
+          NAMES OpenCL libOpenCL
+          HINTS "${CLML_PATH}" "${CLML_PATH}/lib64"
+          )
+    list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_CLML_COMPUTE_LIB})
+    list(APPEND RUNTIME_SRCS ${CLML_CONTRIB_SRC})
+    message(STATUS "Build with CLML graph runtime support: "
+            ${EXTERN_CLML_COMPUTE_LIB})
+
+    # Set flag to detect CLML graph runtime support.
+    add_definitions(-DTVM_GRAPH_EXECUTOR_CLML)
+
+    message(STATUS "Enable OpenCL as fallback to CLML")
+    file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
+    list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+    set(USE_OPENCL ON)
+
+endif()
diff --git a/python/tvm/relay/op/contrib/__init__.py 
b/python/tvm/relay/op/contrib/__init__.py
index a03d0f6d4f..01708e8452 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/python/tvm/relay/op/contrib/__init__.py
@@ -26,3 +26,4 @@ from .ethosn import *
 from .libtorch import *
 from .tensorrt import *
 from .cutlass import *
+from .clml import *
diff --git a/python/tvm/relay/op/contrib/clml.py 
b/python/tvm/relay/op/contrib/clml.py
new file mode 100644
index 0000000000..cacd10de28
--- /dev/null
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -0,0 +1,247 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""CLML Library supported operators."""
+import tvm
+
+from tvm import relay
+from tvm._ffi import register_func
+from tvm.relay import transform
+from tvm.relay.build_module import bind_params_by_name
+
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item
+from .register import register_pattern_table
+from ..strategy.generic import is_depthwise_conv2d
+
+
+def is_clml_runtime_enabled():
+    """Check if the CLML graph runtime is present.
+
+    Returns
+    -------
+    ret: bool
+        True if present, False if not.
+    """
+    check_enabled = tvm.get_global_func("relay.op.is_clml_runtime_enabled", 
True)
+    if check_enabled:
+        return check_enabled()
+    return False
+
+
+def partition_for_clml(mod, params=None):
+    """Partition the graph greedily offloading supported
+    operators to CLML Library.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+    params : Optional[Dict[str, NDArray]]
+        Constant input parameters.
+
+    Returns
+    -------
+    ret : annotated and partitioned module.
+    """
+
+    if params:
+        mod["main"] = bind_params_by_name(mod["main"], params)
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            transform.FoldConstant(),
+            transform.MergeComposite(clml_pattern_table()),
+            transform.AnnotateTarget("clml", False),
+            transform.MergeCompilerRegions(),
+            transform.PartitionGraph(),
+        ]
+    )
+
+    result_mod = seq(mod)
+    return result_mod
+
+
+@register_func("relay.ext.clml.optimize")
+def preprocess_module(mod):
+    """
+    Pre-process a module containing functions ready for CLML codegen. For now 
we enforce OIHW
+    kernel layout and fold the transforms away.
+
+    Parameters
+    ----------
+    mod : Module
+        The module to run passes on.
+
+    Returns
+    -------
+    preprocessed_mod : The processed module.
+    """
+
+    def convert_layout_conv2d(conv2d_function):
+        def convert_conv(attrs, inputs, tinfos, desired_layouts):
+            new_attrs = dict(attrs)
+            data_info = tinfos[0]
+            weight_info = tinfos[1]
+            desired_data_layout, desired_kernel_layout = map(str, 
desired_layouts)
+            new_attrs["data_layout"] = desired_data_layout
+            new_attrs["kernel_layout"] = desired_kernel_layout
+
+            if is_depthwise_conv2d(
+                data_info.shape,
+                attrs["data_layout"],
+                weight_info.shape,
+                attrs["kernel_layout"],
+                attrs["groups"],
+            ):
+                dkl = desired_kernel_layout
+                new_attrs["kernel_layout"] = dkl[1] + dkl[0] + dkl[2] + dkl[3]
+            return conv2d_function(*inputs, **new_attrs)
+
+        return convert_conv
+
+    with OpAttrContext(
+        "nn.conv2d", "FTVMConvertOpLayout", 
convert_layout_conv2d(tvm.relay.nn.conv2d)
+    ):
+        seq = tvm.transform.Sequential(
+            [
+                transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"]}),
+                transform.FoldConstant(),
+            ]
+        )
+        preprocessed_mod = seq(mod)
+    return preprocessed_mod
+
+
+@register_pattern_table("clml")
+def clml_pattern_table():
+    """Get the CLML pattern table."""
+
+    def conv_pattern():
+        """Create a convolution pattern."""
+        pattern = is_op("nn.conv2d")(wildcard(), is_constant())
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, 
is_constant()))
+        pattern = pattern.optional(
+            lambda x: is_op("nn.batch_norm")(
+                x, is_constant(), is_constant(), is_constant(), is_constant()
+            )
+        )
+        pattern = pattern.optional(is_tuple_get_item)
+        pattern = pattern.optional(is_op("nn.relu"))
+        return pattern
+
+    def batch_norm_pattern():
+        """Create a batch norm pattern."""
+        pattern = is_op("nn.batch_norm")(
+            wildcard(), is_constant(), is_constant(), is_constant(), 
is_constant()
+        )
+        pattern = is_tuple_get_item(pattern)
+        return pattern
+
+    def dense_pattern():
+        """Create a dense pattern."""
+        pattern = is_op("nn.dense")(wildcard(), is_constant())
+        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+        return pattern
+
+    def pad_pattern():
+        """Create a pad pattern."""
+        pattern = is_op("nn.pad")(wildcard(), wildcard())
+        return pattern
+
+    def check_conv(extract):
+        """Check conv pattern is supported by CLML."""
+        call = extract
+        if isinstance(call, tvm.relay.expr.TupleGetItem):
+            call = call.tuple_value
+        elif call.op.name == "nn.relu":
+            call = call.args[0]
+            if isinstance(call, tvm.relay.expr.TupleGetItem):
+                call = call.tuple_value
+        while call.op.name != "nn.conv2d":
+            call = call.args[0]
+        attrs, args = call.attrs, call.args
+        if attrs.data_layout != "NCHW":
+            return False
+        data_typ = args[0].checked_type
+        kernel_typ = args[1].checked_type
+        is_depthwise = is_depthwise_conv2d(
+            data_typ.shape,
+            attrs["data_layout"],
+            kernel_typ.shape,
+            attrs["kernel_layout"],
+            attrs["groups"],
+        )
+        if attrs.groups != 1 and not is_depthwise:
+            return False
+        return True
+
+    return [
+        ("clml.conv2d", conv_pattern(), check_conv),
+        ("clml.dense", dense_pattern()),
+        ("clml.pad", pad_pattern()),
+        ("clml.batch_norm", batch_norm_pattern()),
+    ]
+
+
+def _register_external_op_helper(op_name, supported=True):
+    @tvm.ir.register_op_attr(op_name, "target.clml")
+    def _func_wrapper(expr):
+        return supported
+
+    return _func_wrapper
+
+
+_register_external_op_helper("clip")
+_register_external_op_helper("relu")
+_register_external_op_helper("nn.global_avg_pool2d")
+_register_external_op_helper("nn.global_max_pool2d")
+_register_external_op_helper("nn.softmax")
+_register_external_op_helper("reshape")
+
+
+class OpAttrContext(object):
+    """Temporarily changes the attr of an op."""
+
+    def __init__(self, op_name, attr_key, attr_value):
+        """Saves the required info for RAII pattern usage.
+
+        Parameters
+        ----------
+        op_name : str
+            The op name.
+
+        attr_key : str
+            The attribute name.
+
+        attr_value : object
+            The attribute value.
+        """
+        self.op = relay.op.get(op_name)
+        self.attr_key = attr_key
+        self.attr_value = attr_value
+
+    def __enter__(self):
+        self.older_attr = self.op.get_attr(self.attr_key)
+        self.op.reset_attr(self.attr_key)
+        self.op.set_attr(self.attr_key, self.attr_value)
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        self.op.reset_attr(self.attr_key)
+        if self.older_attr:
+            self.op.set_attr(self.attr_key, self.older_attr)
diff --git a/src/relay/backend/contrib/clml/codegen.cc 
b/src/relay/backend/contrib/clml/codegen.cc
new file mode 100644
index 0000000000..fa082a423d
--- /dev/null
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/clml/codegen.cc
+ * \brief Implementation of the Relay -> CLML JSON serializer.
+ */
+#include <tvm/ir/module.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/type.h>
+#include <tvm/tir/analysis.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../../utils.h"
+#include "../codegen_json/codegen_json.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+/*!
+ * \brief Generates an CLMLModule from a relay expression. This "compilation"
+ * does not require CLML since the actual conversion using CLML APIs is
+ * deferred until creation of the runtime. This step simply serializes the
+ * relay program into a JSON string.
+ */
+class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
+  using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+  using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
+
+ public:
+  CLMLJSONSerializer(const std::string& symbol, const Expr& expr)
+      : JSONSerializer(symbol, expr), clml_symbol_(symbol) {}
+
+  /*!
+   * \brief A series of operators that form a composite
+   * convolution. Supports nn.conv2d
+   */
+  struct CompositeConvNode {
+    const CallNode* pad = nullptr;
+    const CallNode* conv = nullptr;
+    const CallNode* bn = nullptr;
+    const CallNode* bias = nullptr;
+    const CallNode* activation = nullptr;
+    std::string act_type;
+  };
+
+  /*!
+   * \brief Visit call nodes and generate appropriate JSON node.
+   *
+   * \param cn The current call node.
+   * \return A list of graph entry nodes.
+   */
+  std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) override {
+    if (cn->op.as<OpNode>()) {
+      return JSONSerializer::VisitExpr_(cn);
+    }
+    if (!cn->op.as<FunctionNode>()) {
+      LOG(FATAL) << "CLML JSON runtime does not support calls to " << 
cn->op->GetTypeKey();
+    }
+    auto fn = cn->op.as<FunctionNode>();
+    auto comp = fn->GetAttr<String>(attr::kComposite);
+    ICHECK(comp.defined()) << "CLML JSON runtime only supports composite 
functions.";
+    const std::string name = comp.value();
+    std::shared_ptr<JSONGraphNode> json_node;
+    if (name == "clml.conv2d") {
+      json_node = CreateCompositeConvJSONNode(cn);
+    } else if (name == "clml.batch_norm") {
+      json_node = CreateBatchNormJSONNode(cn);
+    } else if (name == "clml.dense") {
+      json_node = CreateDenseJSONNode(cn);
+    } else if (name == "clml.pad") {
+      json_node = CreatePadJSONNode(cn);
+    } else {
+      LOG(FATAL) << "Unrecognized CLML pattern: " << name;
+    }
+    return AddNode(json_node, GetRef<Expr>(cn));
+  }
+
+  /*!
+   * \brief Visit call nodes and generate ordered params.
+   *
+   * \param cn The current constant node.
+   * \return A list of graph entry nodes.
+   */
+  std::vector<JSONGraphNodeEntry> VisitExpr_(const ConstantNode* cn) override {
+    std::string name = "clml_" + clml_symbol_ + "_const_" + 
std::to_string(clml_params_.size());
+    clml_params_.push_back(name);
+    clml_params_map_[name] = cn->data;
+    auto node = std::make_shared<JSONGraphNode>(name, "const" /* op_type_ */);
+    return AddNode(node, GetRef<Expr>(cn));
+  }
+
+  Array<String> GetParams() const { return clml_params_; }
+  Map<String, runtime::NDArray> GetParamsMap() const {
+    return Map<String, runtime::NDArray>(clml_params_map_);
+  }
+
+ private:
+  std::string clml_symbol_;
+  Array<String> clml_params_;
+  std::unordered_map<String, runtime::NDArray> clml_params_map_;
+  /*!
+   * \brief Extract convolution nodes from a composite function.
+   *
+   * \param cn The call node of the composite function.
+   * \return Extracted composite convolution nodes.
+   */
+  static CompositeConvNode UnpackCompositeConvolution(const CallNode* cn) {
+    CompositeConvNode nodes{};
+
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    // Traverse composite convolution function from child to parent
+    const auto* current_call = fn->body.as<CallNode>();
+    if (fn->body.as<TupleGetItemNode>()) {
+      auto tuple_item = fn->body.as<TupleGetItemNode>();
+      current_call = tuple_item->tuple.as<CallNode>();
+    } else {
+      current_call = fn->body.as<CallNode>();
+    }
+    if (backend::IsOp(current_call, "nn.relu")) {
+      nodes.activation = current_call;
+      nodes.act_type = "relu";
+      if (current_call->args[0].as<TupleGetItemNode>()) {
+        auto tuple_item = current_call->args[0].as<TupleGetItemNode>();
+        current_call = tuple_item->tuple.as<CallNode>();
+      } else {
+        current_call = current_call->args[0].as<CallNode>();
+      }
+    }
+    if (backend::IsOp(current_call, "nn.batch_norm")) {
+      nodes.bn = current_call;
+      current_call = current_call->args[0].as<CallNode>();
+    }
+    if (backend::IsOp(current_call, "add")) {
+      nodes.bias = current_call;
+      current_call = current_call->args[0].as<CallNode>();
+    }
+    // Enforce a convolution node exists at this point during traversal
+    ICHECK(backend::IsOp(current_call, "nn.conv2d"));
+    nodes.conv = current_call;
+    if (!current_call->args.empty() && 
current_call->args[0]->IsInstance<CallNode>()) {
+      current_call = current_call->args[0].as<CallNode>();
+      if (backend::IsOp(current_call, "nn.pad")) {
+        nodes.pad = current_call;
+      }
+    }
+    return nodes;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a composite convolution.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode* 
cn) {
+    CompositeConvNode nodes = UnpackCompositeConvolution(cn);
+
+    const auto* conv_attr = nodes.conv->attrs.as<Conv2DAttrs>();
+    ICHECK(conv_attr);
+
+    std::string name;
+    std::string name_prefix = "nn";
+
+    // Distinguish between normal and depth-wise convolution
+    if (conv_attr->channels.defined() &&
+        tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) &&
+        conv_attr->groups != 1) {
+      name = "depthwise_conv2d";
+      ICHECK(conv_attr->kernel_layout == "IOHW")
+          << "Kernel layout must be IOHW, has the module been pre-processed 
correctly?";
+    } else {
+      name = "conv2d";
+      ICHECK(conv_attr->kernel_layout == "OIHW")
+          << "Kernel layout must be OIHW, has the module been pre-processed 
correctly?";
+    }
+
+    // Inputs must be added in the same order they appear in the relay graph.
+    std::vector<JSONGraphNodeEntry> inputs;
+
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    inputs.push_back(VisitExpr(nodes.conv->args[1])[0]);
+    if (nodes.bias) {
+      inputs.push_back(VisitExpr(nodes.bias->args[1])[0]);
+    }
+    // Deal with Batchnorm Fusing here
+    if (nodes.bn) {
+      inputs.push_back(VisitExpr(nodes.bn->args[1])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[2])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[3])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[4])[0]);
+    }
+
+    auto json_node = std::make_shared<JSONGraphNode>(name_prefix + "." + name, 
"kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, nodes.conv);
+
+    if (nodes.bn) {
+      const auto* bn_attr = nodes.bn->attrs.as<BatchNormAttrs>();
+      std::vector<dmlc::any> bn_any_attr;
+      std::vector<std::string> bn_args = {
+          std::to_string(bn_attr->axis), std::to_string(bn_attr->epsilon),
+          std::to_string(bn_attr->center), std::to_string(bn_attr->scale)};
+      bn_any_attr.emplace_back(bn_args);
+      json_node->SetAttr("batchnorm", bn_any_attr);
+    }
+
+    // Override attributes
+    if (nodes.pad) {
+      const auto* pad_attr = nodes.pad->attrs.as<PadAttrs>();
+      ICHECK(pad_attr);
+      auto p = pad_attr->pad_width;
+      // Standard convolution pad layout for TVM: dimension wise pair of pre 
and post padding.
+      // CLML takes dimension wise pre-padding followed by dimension wise 
post-padding.
+      std::vector<std::string> padding = 
{std::to_string(p[2][0].as<IntImmNode>()->value),
+                                          
std::to_string(p[3][0].as<IntImmNode>()->value),
+                                          
std::to_string(p[2][1].as<IntImmNode>()->value),
+                                          
std::to_string(p[3][1].as<IntImmNode>()->value)};
+      std::vector<dmlc::any> padding_attr;
+      padding_attr.emplace_back(padding);
+      json_node->SetAttr("padding", padding_attr);
+    }
+
+    if (nodes.activation) {
+      std::vector<std::string> activation_type = {nodes.act_type};
+      std::vector<dmlc::any> act_attr;
+      act_attr.emplace_back(activation_type);
+      json_node->SetAttr("activation_type", act_attr);
+    }
+    return json_node;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a Batchnorm operator.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateBatchNormJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* tuple_item = fn->body.as<TupleGetItemNode>();
+    ICHECK(tuple_item);
+    const auto* bn = tuple_item->tuple.as<CallNode>();
+    ICHECK(bn);
+    const auto* bn_op = bn->op.as<OpNode>();
+    ICHECK(bn_op);
+    const std::string name = bn_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    inputs.push_back(VisitExpr(bn->args[1])[0]);
+    inputs.push_back(VisitExpr(bn->args[2])[0]);
+    inputs.push_back(VisitExpr(bn->args[3])[0]);
+    inputs.push_back(VisitExpr(bn->args[4])[0]);
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 
1);
+    SetCallNodeAttribute(json_node, bn);
+    return json_node;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a Dense operator.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateDenseJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* dense = fn->body.as<CallNode>();
+    const CallNode* bias = nullptr;
+
+    if (backend::IsOp(dense, "add")) {
+      bias = dense;
+      dense = dense->args[0].as<CallNode>();
+    }
+    ICHECK(backend::IsOp(dense, "nn.dense"));
+    const auto* dense_op = dense->op.as<OpNode>();
+    ICHECK(dense_op);
+    const std::string name = dense_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    inputs.push_back(VisitExpr(dense->args[1])[0]);
+    if (bias) {
+      inputs.push_back(VisitExpr(bias->args[1])[0]);
+    }
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 
1);
+    SetCallNodeAttribute(json_node, dense);
+    return json_node;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a Pad operator.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreatePadJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* pad = fn->body.as<CallNode>();
+    const auto* pad_op = pad->op.as<OpNode>();
+    ICHECK(pad_op);
+    const std::string name = pad_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 
1);
+
+    const auto* pad_attr = pad->attrs.as<PadAttrs>();
+    ICHECK(pad_attr);
+    auto p = pad_attr->pad_width;
+    // TVM padding format: Dimension wise pair of pre and post padding.
+    // CLML padding format: Dimension wise pre padding followed by dimension 
wise post padding.
+    std::vector<std::string> padding = 
{std::to_string(p[2][0].as<IntImmNode>()->value),
+                                        
std::to_string(p[2][1].as<IntImmNode>()->value),
+                                        
std::to_string(p[3][0].as<IntImmNode>()->value),
+                                        
std::to_string(p[3][1].as<IntImmNode>()->value)};
+    std::vector<dmlc::any> padding_attr;
+    padding_attr.emplace_back(padding);
+    json_node->SetAttr("pad_width", padding_attr);
+
+    std::vector<std::string> pad_mode = {pad_attr->pad_mode};
+    std::vector<dmlc::any> pad_mode_attr;
+    pad_mode_attr.emplace_back(pad_mode);
+    json_node->SetAttr("pad_mode", pad_mode_attr);
+
+    return json_node;
+  }
+};
+
+/*!
+ * \brief Create a runtime module for CLML.
+ *
+ * This consists of a series of "serialized functions" which each represent a
+ * sub-graph to be computed by CLML and will each be executed independently 
from
+ * one another. Each function consists of serialized JSON describing the 
sub-graph
+ * and serialized constant tensors.
+ *
+ * \note The CLML runtime module only supports a single operator per
+ * sub-graph currently.
+ *
+ * \param ref The ext_func Relay expression/module to be executed using extern 
ops.
+ * \return A runtime module.
+ */
+runtime::Module CLMLCompiler(const ObjectRef& ref) {
+  ICHECK(ref->IsInstance<FunctionNode>()) << "The input ref is expected to be 
a Relay function.";
+  Function func = Downcast<Function>(ref);
+  std::string func_name = backend::GetExtSymbol(func);
+
+  CLMLJSONSerializer serializer(func_name, func);
+  serializer.serialize();
+  std::string graph_json = serializer.GetJSON();
+  auto param_names = serializer.GetParams();
+  const auto* pf = runtime::Registry::Get("runtime.clml_runtime_create");
+  ICHECK(pf != nullptr) << "Cannot find CLML runtime module to create";
+  runtime::Module lib = (*pf)(func_name, graph_json, param_names);
+  return lib;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.clml").set_body_typed(CLMLCompiler);
+
+/*!
+ * \brief Check whether CLML graph runtime is used.
+ *
+ * \return True if CLML graph runtime is enabled, False if not.
+ */
+inline constexpr bool IsCLMLRuntimeEnabled() {
+#if TVM_GRAPH_EXECUTOR_CLML
+  return true;
+#else
+  return false;
+#endif
+}
+
+TVM_REGISTER_GLOBAL("relay.op.is_clml_runtime_enabled").set_body_typed(IsCLMLRuntimeEnabled);
+
+Map<String, runtime::NDArray> CLMLConstantUpdater(Expr func, std::string 
symbol) {
+  CLMLJSONSerializer serializer(symbol, func);
+  serializer.serialize();
+  auto pmap = serializer.GetParamsMap();
+  return pmap;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.clml.constant_updater").set_body_typed(CLMLConstantUpdater);
+
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc 
b/src/runtime/contrib/clml/clml_runtime.cc
new file mode 100644
index 0000000000..7966c0e78b
--- /dev/null
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -0,0 +1,1091 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/clml/clml_runtime.cc
+ * \brief A simple JSON runtime for CLML.
+ */
+
+#include <CL/cl.h>
+#include <CL/opencl.h>
+#ifdef TVM_GRAPH_EXECUTOR_CLML
+#include <CL/cl_qcom_ml_ops.h>
+#endif
+#include <stdlib.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
+
+#include <fstream>
+#include <map>
+#include <utility>
+
+#include "../../opencl/opencl_common.h"
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace tvm::runtime::json;
+using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+
+class CLMLRuntime : public JSONRuntimeBase {
+ public:
  /*!
   * \brief The CLML runtime module. Deserialize the provided functions
   * on creation and store in the layer cache.
   *
   * \param symbol_name The name of the function.
   * \param graph_json serialized JSON representation of a sub-graph.
   * \param const_names The names of each constant in the sub-graph.
   * \note All heavy setup (tensor/op creation) is deferred to Init().
   */
  explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
                       const Array<String>& const_names)
      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
  /*!
   * \brief Release every CLML/OpenCL object owned by the cached layer:
   * the tuning cache (when this was a tuning run), all tensors and their
   * backing cl_mem buffers, the CLML ops, the input/output staging
   * placeholder tensors, and finally the descriptor set.
   */
  ~CLMLRuntime() {
#ifdef TVM_GRAPH_EXECUTOR_CLML
    cl_int result = 0;
    if (this->is_tuning_run) {
      result = h_ClmlIntf->clReleaseMLTuningCacheQCOM(this->tuning_cache);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTuningCacheQCOM:" << result;
    }
    // Every tensor in storage_map owns both a CLML tensor object and a cl_mem buffer.
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.first;
      result = h_ClmlIntf->clReleaseMLTensorQCOM(tensor_desc->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
      result = clReleaseMemObject(tensor_desc->memory);
      ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result;
    }
    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
      result = h_ClmlIntf->clReleaseMLOpQCOM(this->layer_.function[i]);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLOpQCOM:" << result;
    }
    // Placeholder tensors are released here but their cl_mem is not: input
    // placeholders may alias caller-owned OpenCL buffers (see Run()).
    for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end();
         it++) {
      result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
    }
    for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end();
         it++) {
      result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
    }
    result = h_ClmlIntf->clReleaseMLTensorMemoryDescriptorSetQCOM(layer_.descriptorSet);
    ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorMemoryDescriptorSetQCOM:" << result;
#endif
  }
+
  /*!
   * \brief The type key of the module.
   *
   * \return module type key ("clml"); identifies this runtime to TVM's
   * module machinery.
   */
  const char* type_key() const override { return "clml"; }
+
  /*!
   * \brief Initialize runtime. Create CLML layer from JSON
   * representation.
   *
   * \param consts The constant params from compiled model.
   * \note NOTE(review): BuildEngine() is invoked unconditionally here while
   * its definition appears to sit under TVM_GRAPH_EXECUTOR_CLML — confirm
   * the build path when that macro is unset.
   */
  void Init(const Array<NDArray>& consts) override {
    ICHECK_EQ(consts.size(), const_idx_.size())
        << "The number of input constants must match the number of required.";
    // Bind the constant NDArrays to their JSON node entries.
    SetupConstants(consts);

#ifdef TVM_GRAPH_EXECUTOR_CLML
    InitCLML();
#endif

    BuildEngine();
  }
+
+#ifdef TVM_GRAPH_EXECUTOR_CLML
+  std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& val) {
+    std::vector<cl_uint> array;
+    for (auto i : val) {
+      array.push_back((cl_uint)stoi(i));
+    }
+    return array;
+  }
+
+  void InitCLML() {
+    // Setup CLML Context
+    cl_int result = 0;
+
+    // Initialize Context and Command Queue
+    result = clGetPlatformIDs(1, &platform, NULL);
+    ICHECK(result == CL_SUCCESS) << "clGetPlatformIDs:" << result;
+
+    uint32_t num_devices = 0;
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, 
&num_devices);
+    ICHECK(result == CL_SUCCESS && num_devices == 1) << "clGetDeviceIDs:" << 
result;
+
+    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
+    ICHECK(device_id && result == CL_SUCCESS) << "clGetDeviceIDs:" << result;
+
+    if (!ExtensionStringPresent(device_id)) {
+      LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n";
+      return;
+    }
+
+    // Reuse the OpenCl work space from TVM Device API.
+    auto func = tvm::runtime::Registry::Get("device_api.opencl");
+    ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry";
+    auto device_api = static_cast<cl::OpenCLWorkspace*>(((*func)()).operator 
void*());
+    this->context = device_api->context;
+    bool queue_found = false;
+    for (size_t i = 0; i < device_api->devices.size(); ++i) {
+      if (device_api->devices[i] == device_id) {
+        this->queue = device_api->queues[i];
+        this->evts = &(device_api->events[i]);
+        queue_found = true;
+      }
+    }
+    ICHECK(queue_found != false) << "Device queue not found in OpenCL 
Workspace";
+
+    // Query and Get CLML Interface
+    static const cl_uint MAX_VERSIONS = 256;
+    cl_int majorVersions[MAX_VERSIONS];
+    cl_int minorVersions[MAX_VERSIONS];
+    cl_uint numVersions = 0;
+    result = clQueryMLInterfaceVersionsQCOM(NULL, NULL, 0, &numVersions);
+    ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << 
result;
+    ICHECK(numVersions > 0u);
+    ICHECK(numVersions <= MAX_VERSIONS);
+
+    result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions, 
numVersions, NULL);
+    ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" << 
result;
+
+    for (cl_uint i = 0; i < numVersions; ++i) {
+      if (majorVersions[i] == 2) {
+        LOG(WARNING) << "CLML Version Selected:" << majorVersions[i] << " : " 
<< majorVersions[i];
+        h_ClmlIntf = clGetMLInterfaceV2QCOM(0);
+        ICHECK(h_ClmlIntf != NULL) << "clGetMLInterfaceV2QCOM:" << result;
+        break;
+      }
+    }
+    char* tune_flag;
+    if ((tune_flag = getenv("CLML_IS_TUNNING_RUN")))
+      this->is_tuning_run = std::stoi(tune_flag);
+    else
+      this->is_tuning_run = 0;
+
+    if (!(tuning_file = getenv("CLML_TUNNING_CACHE"))) this->is_tuning_run = 0;
+    // A Tuning run, so create the cache from scratch
+    result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache);
+    ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result;
+    if (!this->is_tuning_run && this->tuning_file) {
+      std::vector<unsigned char> buffer;
+      buffer = readBinFile(this->tuning_file);
+      result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache, 
buffer.size(), buffer.data());
+      ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result;
+    }
+  }
+
+  std::vector<unsigned char> readBinFile(const std::string& filename) {
+    std::ifstream fin(filename, std::ios::binary | std::ios::ate);
+    if (!fin.good()) {
+      LOG(FATAL) << "ERROR: Could not load tuning cache file: " + filename;
+    }
+    ICHECK(fin.good());
+    int64_t size = fin.tellg();
+    fin.seekg(0, std::ios::beg);
+    std::vector<unsigned char> buffer(static_cast<size_t>(size));
+    char* ptr = reinterpret_cast<char*>(buffer.data());
+    fin.read(ptr, size);
+    ICHECK(fin.good());
+    return buffer;
+  }
+
  /*!
   * \brief Enqueue a host-to-device write of a CLML tensor.
   *
   * \param tensor Destination CLML tensor descriptor (tensor object + device memory).
   * \param data Host pointer holding the source bytes.
   * \param layout Layout of the host data, NCHW by default.
   * \note The write is only enqueued; this function does not wait for the
   * returned event to complete.
   */
  void CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
                            cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
    cl_int result = 0;
    cl_event evt = NULL;
    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, data, layout, tensor->tensor,
                                                        tensor->memory,
                                                        0,      // n waitlist
                                                        NULL,   // waitlist
                                                        &evt);  // event
    ICHECK((evt != NULL) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result;
  }
+
  /*!
   * \brief Read a CLML tensor back to host memory.
   *
   * \param tensor Source CLML tensor descriptor (tensor object + device memory).
   * \param data Host pointer receiving the bytes.
   * \param layout Desired layout of the host data, NCHW by default.
   * \note Unlike CopyDataToCLMLTensor, this call blocks until the read
   * event completes.
   */
  void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
                              cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
    cl_int result = 0;
    cl_event readEvent = NULL;
    // Read the output tensor
    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, tensor->tensor, tensor->memory, data,
                                                       layout,
                                                       0,            // n waitlist
                                                       NULL,         // waitlist
                                                       &readEvent);  // event
    ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result;

    result = clWaitForEvents(1, &readEvent);
    ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result;
  }
+
  /*!
   * \brief Run inference: stage inputs into CLML tensors, enqueue every
   * cached CLML op, then copy outputs back to the caller's buffers.
   *
   * Inputs/outputs on kDLCPU are copied via enqueue-write/read; inputs and
   * outputs already resident on kDLOpenCL are copied device-to-device via
   * placeholder tensors that alias the caller's cl_mem; any other device
   * type goes through a temporary host buffer using TVMArrayCopy*Bytes.
   */
  void Run() override {
    cl_int result = 0;
    for (size_t i = 0; i < input_nodes_.size(); ++i) {
      auto nid = input_nodes_[i];
      uint32_t eid = EntryID(nid, 0);
      if (nodes_[nid].GetOpType() == "input") {
        void* data = data_entry_[eid]->data;
        // Element count of this input (product of all dims).
        size_t isize = 1;
        for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
          isize *= data_entry_[eid]->shape[j];
        }
        if (kDLCPU == data_entry_[eid]->device.device_type) {
          CopyDataToCLMLTensor(layer_.inputs[i], data);
        } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
          // Point the placeholder at the caller's OpenCL buffer, then copy
          // device-to-device into the (optimal-layout) CLML input tensor.
          layer_.in_placeholder[i]->memory = static_cast<cl_mem>(
              ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
          cl_event cpy_evt = NULL;
          result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM(
              queue, layer_.in_placeholder[i]->tensor, layer_.in_placeholder[i]->memory,
              layer_.inputs[i]->tensor, layer_.inputs[i]->memory, 0, NULL, &cpy_evt);
          ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
        } else {
          // Other device types: bounce through a temporary host buffer.
          DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
          cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
          int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;
          void* tmpptr = reinterpret_cast<void*>(malloc(isize * dtype_size));
          TVMArrayCopyToBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              isize * dtype_size);
          CopyDataToCLMLTensor(layer_.inputs[i], tmpptr);
          free(tmpptr);
        }
      }
    }

    // Enqueue every cached CLML op, recording one event per op for profiling.
    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
      this->evts->resize(this->evts->size() + 1);
      cl_event* evt = &(this->evts->back());
      result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
                                             this->layer_.descriptorSet, 0, NULL, evt);
      ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
    }

    if (getenv("CLML_PROFILING")) {
      cl_ulong start, end;
      cl_ulong duration = 0;
      // Waiting on the last enqueued event implies the earlier ops finished
      // on this in-order queue.
      clWaitForEvents(1, &(this->evts->back()));
      for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) {
        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
                                &start, nullptr);
        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end,
                                nullptr);
        duration += (end - start);
        LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start);
      }
      LOG(WARNING) << "Total Duration:" << duration;
    }

    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);
      void* data = data_entry_[eid]->data;

      size_t osize = 1;
      for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
        osize *= data_entry_[eid]->shape[j];
      }
      // NOTE(review): the CPU and fallback paths read layer_.outputs[0]
      // while the OpenCL path uses layer_.outputs[i]; consistent only if a
      // sub-graph has exactly one output — confirm against BuildEngine().
      if (kDLCPU == data_entry_[eid]->device.device_type) {
        CopyDataFromCLMLTensor(layer_.outputs[0], data);
      } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
        layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
            ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
        cl_event cpy_evt = NULL;
        result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM(
            queue, layer_.outputs[i]->tensor, layer_.outputs[i]->memory,
            layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, NULL,
            &cpy_evt);
        ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
      } else {
        DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
        cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
        int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;

        void* tmpptr = reinterpret_cast<void*>(malloc(osize * dtype_size));
        CopyDataFromCLMLTensor(layer_.outputs[0], tmpptr);
        TVMArrayCopyFromBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              osize * dtype_size);
        free(tmpptr);
      }
    }
  }
+
+ private:
  /*!
   * \brief Build CLML layer from JSON representation and cache.
   *
   * Walks the JSON graph in node order, creating CLML tensors for inputs
   * and one CLML op per supported kernel; then allocates device memory for
   * every tensor, uploads constants, builds the tensor-memory descriptor
   * set, and optionally runs/saves op tuning.
   */
  void BuildEngine() {
    size_t nid;
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (node.GetOpType() == "input") {
        auto clml_input = MakeCLMLTensorFromJSONNode(node);
        this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)});
        this->layer_.inputs.push_back(clml_input);
        // Input copy placeholder Tensor (NCHW) used to stage caller data in Run().
        this->layer_.in_placeholder.push_back(
            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM));
      } else if (node.GetOpType() == "kernel") {
        auto op_name = node.GetOpName();
        // Dispatch on the operator name; each Create*Layer appends the CLML
        // op to layer_.function and returns the op's output tensor.
        if ("nn.conv2d" == op_name) {
          auto out = CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.depthwise_conv2d" == op_name) {
          auto out = CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.relu6" == op_name) {
          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU6);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.relu" == op_name) {
          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.batch_norm" == op_name) {
          auto out = CreateBatchNormLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
          auto out = CreateGlobalPoolingLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("reshape" == op_name) {
          auto out = CreateReshapeLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.dense" == op_name) {
          auto out = CreateDenseLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.softmax" == op_name) {
          auto out = CreateSoftMaxLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.pad" == op_name) {
          auto out = CreatePadLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("clip" == op_name) {
          auto out = CreateClipLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else {
          LOG(FATAL) << "Unsupported op: " << op_name;
        }
        this->layer_.layer_names.push_back(op_name);
      } else if (node.GetOpType() != "const") {
        LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
      }
    }
    // The last visited node is treated as the sub-graph output.
    if (nid > 0) {
      this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first);
      this->layer_.out_placeholder.push_back(
          MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM));
    }
    // Allocate device memories and initialize the params if any
    cl_int result = 0;
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.first;
      JSONGraphNode node = it->second.second;
      void* node_data = nullptr;

      allocateTensorMemory(h_ClmlIntf, context, tensor_desc);

      if (node.GetOpType() == "const") {
        node_data = data_entry_[EntryID(it->first, 0)]->data;
        if (node_data != nullptr) {
          CopyDataToCLMLTensor(tensor_desc, node_data);
        }
      }
      this->layer_.tensorMemDescs.push_back(*tensor_desc);
    }

    // Setup descriptor set
    result = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&this->layer_.descriptorSet);
    ICHECK(result == CL_SUCCESS) << "clCreateMLTensorMemoryDescriptorSetQCOM:" << result;

    result = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(
        this->layer_.descriptorSet, static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
        this->layer_.tensorMemDescs.data());
    ICHECK(result == CL_SUCCESS) << "clUpdateMLTensorMemoryDescriptorSetQCOM:" << result;

    // Tuning run: tune every op, then serialize the tuning cache to disk.
    if (this->is_tuning_run) {
      LOG(WARNING) << "CLML Tunning In Progress:";
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        LOG(WARNING) << "CLML Tunning:" << i;
        result = h_ClmlIntf->clTuneMLOpQCOM(queue, this->layer_.function[i],
                                            this->layer_.descriptorSet, this->tuning_cache, NULL);
        ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
      }

      // Two-call pattern: first query the serialized size, then fetch the bytes.
      size_t cacheLenBytes = 0;
      size_t lenRet = 0;
      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes);
      ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result;

      std::vector<unsigned char> savedCache(cacheLenBytes, 0);
      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(),
                                                   savedCache.data(), &lenRet);
      assert(result == CL_SUCCESS);

      std::ofstream cache_out(tuning_file, std::ios_base::binary);
      if (cache_out) {
        cache_out.write(reinterpret_cast<char*>(savedCache.data()), savedCache.size());
        cache_out.close();
      }
      LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file;
    }
  }
+
  /*!
   * \brief CLML objects we cache in order to avoid needing to construct
   * a new layer each time.
   */
  struct CachedLayer {
    // CLML ops in enqueue order.
    std::vector<cl_ml_op_qcom> function;
    // Sub-graph input tensors (optimal layout).
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> inputs;
    // NCHW staging tensors used to copy caller input buffers in Run().
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> in_placeholder;
    // Sub-graph output tensors.
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> outputs;
    // NCHW staging tensors used to copy results back to the caller.
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> out_placeholder;
    // Per-op output / input tensors collected while building the engine.
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> func_outs;
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> func_ins;
    // JSON node id -> (tensor descriptor, node) for every materialized tensor.
    std::map<int, std::pair<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>, JSONGraphNode>>
        storage_map;
    // Flat copy of all tensor/memory descriptors for the descriptor set.
    std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;
    // Presumably input/output subsets of the above — not used in the code
    // visible here; verify before relying on them.
    std::vector<cl_ml_tensor_memory_desc_qcom> in_tensorMemDescs;
    std::vector<cl_ml_tensor_memory_desc_qcom> out_tensorMemDescs;
    // Descriptor set bound at clEnqueueMLOpQCOM time.
    cl_ml_tensor_mem_desc_set_qcom descriptorSet;
    // Operator names, indexed like `function`, used for profiling logs.
    std::vector<std::string> layer_names;
    // Shared dummy tensor for optional operands (e.g. missing bias).
    cl_ml_tensor_qcom unusedTensor = NULL;
  };
+
  // Plain holder for the four NCHW dimensions of a CLML tensor.
  struct tensor_dims_t {
    uint32_t n, c, h, w;
  };
+
+  bool ExtensionStringPresent(cl_device_id device_id) {
+    cl_int result = 0;
+
+    size_t reqd_size = 0;
+    result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, NULL, 
&reqd_size);
+    ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << 
result;
+
+    std::vector<char> buf(reqd_size);
+    result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, 
buf.data(), NULL);
+    ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
+
+    std::string extensions(buf.data());
+    LOG(WARNING) << "OpenCL Extensions:" << extensions;
+    return (extensions.find("cl_qcom_ml_ops") != std::string::npos);
+  }
+
  /*!
   * \brief Create a CLML tensor object for the given 4D dimensions.
   *
   * \param pClmlIntf The CLML interface (passed as void*, cast to the v2 table).
   * \param context OpenCL context to create the tensor in.
   * \param dims NCHW dimensions.
   * \param layout Tensor layout, optimal by default.
   * \param dtype OpenCL channel type (CL_FLOAT or CL_HALF_FLOAT).
   * \return The created tensor object (no memory is attached here).
   */
  cl_ml_tensor_qcom DeviceMakeCLMLTensor(
      void* pClmlIntf, cl_context context, tensor_dims_t dims,
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
      cl_channel_type dtype = CL_FLOAT) {
    cl_ml_tensor_qcom tensor;
    cl_int result = CL_OUT_OF_RESOURCES;

    cl_ml_tensor_desc_qcom desc = {
        dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }};
    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
    result = clmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &tensor);
    ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
    (void)result;  // silence unused warning in builds where ICHECK compiles out
    return tensor;
  }
+
  /*!
   * \brief Allocate an OpenCL buffer sized for a CLML tensor and attach it
   * to the tensor's memory descriptor.
   *
   * \param pClmlIntf The CLML interface (passed as void*, cast to the v2 table).
   * \param context OpenCL context for the allocation.
   * \param pTensorMemDesc Descriptor whose `tensor` is already created;
   * its `memory` field is set to the new buffer.
   * \return CL_SUCCESS on success (failures abort via ICHECK).
   */
  cl_int allocateTensorMemory(void* pClmlIntf, cl_context context,
                              std::shared_ptr<cl_ml_tensor_memory_desc_qcom> pTensorMemDesc) {
    uint32_t size = 0;
    cl_int result = CL_OUT_OF_HOST_MEMORY;
    cl_mem buffer = NULL;

    // Ask CLML how many bytes this tensor requires in its chosen layout.
    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
    result = clmlIntf->clGetMLTensorMemorySizeQCOM(context, pTensorMemDesc->tensor, &size);
    ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;

    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &result);
    ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result;

    pTensorMemDesc->memory = buffer;

    return result;
  }
+
+  tensor_dims_t get_tensor_dims(const JSONGraphNode& node) {
+    std::vector<int64_t> shape = node.GetOpShape()[0];
+    tensor_dims_t dims;
+    dims.n = shape[0];
+    dims.c = shape[1];
+    dims.h = shape[2];
+    dims.w = shape[3];
+    return dims;
+  }
+
+  cl_channel_type MakeCLDataType(const DLDataType& data_type) {
+    if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) {
+      return CL_FLOAT;
+    } else if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 
16) {
+      return CL_HALF_FLOAT;
+    } else {
+      LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
+      return -1;
+    }
+  }
+
  /*!
   * \brief Build a CLML tensor descriptor from a JSON node's shape (or an
   * explicit override shape) and dtype.
   *
   * \param tensor_rep JSON node supplying the default shape and the dtype.
   * \param data Not read in this function — presumably reserved for
   * interface symmetry with callers; confirm before removing.
   * \param c_shape Optional shape override; used instead of the node shape
   * when non-empty.
   * \param layout Tensor layout, optimal by default.
   * \param dtype NOTE(review): this parameter is not used — the channel
   * type is always derived from the node's dtype below; verify intent.
   * \return Descriptor holding the created tensor (memory attached later).
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
      const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
    std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
    std::vector<size_t> clml_shape(shape.begin(), shape.end());
    if (c_shape.size() > 0) {
      clml_shape = c_shape;
    }
    // Make sure the tensors with dimensions less than 4 are padded with 1
    // (only the first four entries are consumed below).
    clml_shape.push_back(1);
    clml_shape.push_back(1);
    clml_shape.push_back(1);

    tensor_dims_t dims;
    dims.n = clml_shape[0];
    dims.c = clml_shape[1];
    dims.h = clml_shape[2];
    dims.w = clml_shape[3];
    DLDataType tvm_dtype = tensor_rep.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);

    auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    tensor_dsc->tensor = DeviceMakeCLMLTensor(h_ClmlIntf, context, dims, layout, cl_dtype);
    return tensor_dsc;
  }
+
  /*!
   * \brief Get-or-create the CLML tensor for a JSON node entry, memoized
   * in layer_.storage_map so each node's tensor is built exactly once.
   *
   * \param tensor The JSON node entry to represent.
   * \param shape Optional shape override forwarded to tensor creation.
   * \param layout Tensor layout, optimal by default.
   * \param dtype Forwarded dtype hint (see MakeCLMLTensor for its handling).
   * \return The cached or newly created CLML tensor descriptor.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
      const JSONGraphNodeEntry& tensor, std::vector<size_t> shape = {},
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
    JSONGraphNode node = nodes_[tensor.id_];
    if (this->layer_.storage_map.find(tensor.id_) == this->layer_.storage_map.end()) {
      // For constants, hand the node's data pointer to the tensor factory.
      void* node_data = nullptr;
      if (node.GetOpType() == "const") {
        node_data = data_entry_[EntryID(tensor)]->data;
      }
      auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape);
      this->layer_.storage_map.insert({tensor.id_, std::make_pair(clml_tensor, node)});
      return clml_tensor;
    } else {
      return this->layer_.storage_map[tensor.id_].first;
    }
  }
  /*!
   * \brief Create a CLML tensor for a JSON graph node.
   *
   * Thin forwarding wrapper over MakeCLMLTensor that reorders the
   * arguments into that function's parameter order.
   *
   * \param node The tensor to represent.
   * \param layout Tensor layout, optimal by default.
   * \param dtype Forwarded dtype hint.
   * \param data (optional) Constant data of input node.
   * \param shape Optional shape override.
   * \return CLML Tensor descriptor.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
      const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
      cl_uint dtype = CL_FLOAT, void* data = nullptr, std::vector<size_t> shape = {}) {
    return MakeCLMLTensor(node, data, shape, layout, dtype);
  }
+  /*!
+   * \brief Create a 2D convolution layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConvolution2DLayer(
+      CachedLayer* layer, const JSONGraphNode& node, cl_convolution_mode_qcom 
mode) {
+    std::vector<std::string> padding = 
node.GetAttr<std::vector<std::string>>("padding");
+    std::vector<std::string> strides = 
node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> dilation = 
node.GetAttr<std::vector<std::string>>("dilation");
+    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+    if (!node.HasAttr("padding")) {
+      clml_padding.resize(4);
+      std::fill(clml_padding.begin(), clml_padding.end(), 0);
+    }
+    cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = 
{clml_padding[0], clml_padding[1]};
+    cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = 
{clml_padding[2], clml_padding[3]};
+    std::vector<cl_uint> v_strides = GetVectorValues(strides);
+    std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
+    cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0], 
v_strides[1]};
+    cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = 
{v_dilation[0], v_dilation[1]};
+    cl_int result = 0;
+
+    cl_uint groups = 
std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
+    if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
+      ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
+    } else {
+      groups = 1;  // Don't need to pass groups to depthwise
+    }
+
+    bool has_act = false;
+    std::string activation_type;
+    cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
+    if (node.HasAttr("activation_type")) {
+      activation_type = 
node.GetAttr<std::vector<std::string>>("activation_type")[0];
+      ICHECK(activation_type == "relu" || activation_type == "relu6")
+          << "Unknown activation type:" << activation_type;
+      if (activation_type == "relu") {
+        clml_act_type = CL_ACTIVATION_RELU;
+      } else {
+        clml_act_type = CL_ACTIVATION_RELU6;
+      }
+      has_act = true;
+    }
+    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, 
CL_PROPAGATE_NAN_QCOM,
+                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    // Collect inputs and outputs, handling nn.conv2d.
+    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
+    size_t num_inputs = inputs.size();
+    bool has_bias;
+    bool has_bn;
+    ICHECK(num_inputs >= 2U && num_inputs <= 7U)
+        << "Batchnorm fused convolution requires bax 7 arguments";
+    has_bias = (num_inputs == 3) || (num_inputs == 7);
+    has_bn = (num_inputs == 6) || (num_inputs == 7);
+    // Input
+    auto input = MakeCLMLTensorFromJSONEntry(inputs[0]);
+
+    // Weight
+    auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]);
+
+    // Bias
+    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    if (has_bias) {
+      bias = MakeCLMLTensorFromJSONEntry(inputs[2]);
+    } else {
+      cl_ml_tensor_desc_qcom desc = {};
+      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+      result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, 
&layer_.unusedTensor);
+      ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << 
"clCreateMLTensorQCOM:" << result;
+      bias->tensor = layer_.unusedTensor;
+    }
+    // Output
+    auto output = MakeCLMLTensorFromJSONNode(node);
+    cl_ml_op_convolution_desc_qcom conv_desc{mode,
+                                             groups,
+                                             4,
+                                             {clml_padding_b[0], 
clml_padding_b[1]},
+                                             {clml_padding_a[0], 
clml_padding_a[1]},
+                                             {clml_strides[0], 
clml_strides[1]},
+                                             {clml_dilation[0], 
clml_dilation[1]},
+                                             0,
+                                             CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    cl_ml_op_qcom op = NULL;
+    if (!has_bn) {
+      if (!has_act) {
+        result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
+            context, 0, &conv_desc, input->tensor, weight->tensor, 
bias->tensor, output->tensor,
+            &op, NULL);
+        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+      } else {
+        result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM(
+            context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor, 
bias->tensor, NULL,
+            output->tensor, &op, tuning_cache);
+        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+      }
+      layer_.func_ins.push_back(input);
+      layer->function.push_back(op);
+    } else {
+      int bn_index = has_bias ? 3 : 2;
+      int axis = 
std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
+      auto bn_dims = get_tensor_dims(nodes_[inputs[bn_index].id_]);
+      std::vector<size_t> bn_shape = {1, 1, 1, 1};
+      bn_shape[axis] = bn_dims.n;
+      auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+      auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+      auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+      auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape);
+      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape);
+      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape);
+      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape);
+
+      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
+                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+      if (!has_act) {
+        result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
+            context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, 
bias->tensor,
+            output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, 
bn_bias->tensor, &op,
+            tuning_cache);
+        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+      } else {
+        result = 
h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM(
+            context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, 
weight->tensor,
+            bias->tensor, output->tensor, NULL, bn_mean->tensor, 
bn_var->tensor, bn_scale->tensor,
+            bn_bias->tensor, &op, tuning_cache);
+
+        ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+      }
+      layer_.func_ins.push_back(input);
+      layer->function.push_back(op);
+    }
+    return output;
+  }
+
+  /*!
+   * \brief Create a ReLU(X) layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateReLULayer(
+      CachedLayer* layer, const JSONGraphNode& node,
+      cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node);
+
+    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, 
CL_PROPAGATE_NAN_QCOM,
+                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    cl_ml_tensor_desc_qcom desc = {};
+    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+    result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, 
&layer_.unusedTensor);
+    ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+    result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(context, 0, 
&act_desc, input->tensor,
+                                                           
layer_.unusedTensor, output->tensor, &op,
+                                                           tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
+  /*!
+   * \brief Create a batch norm layer.
+   *
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateBatchNormLayer(CachedLayer* layer,
+                                                                      const 
JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+    auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
+    std::vector<size_t> bn_shape = {1, 1, 1, 1};
+    bn_shape[axis] = bn_dims.n;
+    auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape);
+    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape);
+    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape);
+    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape);
+
+    auto output = MakeCLMLTensorFromJSONNode(node);
+
+    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
+                                            CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
+        context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, 
bn_scale->tensor,
+        bn_bias->tensor, output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result;
+
+    layer->function.push_back(op);
+    layer_.func_ins.push_back(input);
+    return output;
+  }
+
+  /*!
+   * \brief Create a global pooling layer.
+   *
+   * \note Currently global_max_pool2d and global_avg_pool2d are supported.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateGlobalPoolingLayer(
+      CachedLayer* layer, const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node);
+    auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+    cl_ml_op_pooling_desc_qcom pool_desc = {
+        node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
+                                                   : 
CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+        4,  // reserved
+        {0, 0},
+        {0, 0},
+        {1, 1},
+        {in_dims.w, in_dims.h},
+        CL_PROPAGATE_NAN_QCOM,
+        CL_ARITHMETIC_MODE_FP32_QCOM,
+    };
+
+    cl_ml_tensor_desc_qcom desc = {};
+    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+    result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, 
&layer_.unusedTensor);
+    ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+    result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(context, 0, 
&pool_desc, input->tensor,
+                                                        layer_.unusedTensor, 
output->tensor, &op,
+                                                        tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
+  /*!
+   * \brief Create a SoftMax layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateSoftMaxLayer(CachedLayer* layer,
+                                                                    const 
JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+    auto output = MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr,
+                                             {out_dims.n, out_dims.c, 1, 1});
+
+    cl_ml_op_softmax_desc_qcom softmax_desc = 
{CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
+                                               CL_SOFTMAX_MODE_INSTANCE_QCOM,
+                                               CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(context, 0, &softmax_desc, 
input->tensor,
+                                                 output->tensor, &op, 
tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
  /*!
   * \brief Create a Pad layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \return The CLML tensor descriptor holding the padded output.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePadLayer(CachedLayer* layer,
                                                                const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
    auto output = MakeCLMLTensorFromJSONNode(node);

    // Pad widths arrive as flattened strings; only the first four values are used below.
    std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
    std::vector<cl_uint> clml_padding = GetVectorValues(padding);

    // Map relay pad modes to CLML pad modes; unsupported modes are fatal.
    // NOTE(review): "edge" is mapped to CL_PAD_MODE_SYMMETRIC_QCOM — confirm this
    // matches relay's edge (replicate-border) semantics, which differ from symmetric
    // reflection for pads wider than one element.
    cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
    if (pad_mode == "constant")
      clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
    else if (pad_mode == "edge")
      clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
    else if (pad_mode == "reflect")
      clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
    else
      LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;

    // Only the first four pad values are forwarded; remaining slots stay zero.
    cl_ml_op_pad_desc_qcom pad_desc{
        clml_pad_mode,
        {0, 0},
        {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
        CL_ARITHMETIC_MODE_FP32_QCOM};

    result = h_ClmlIntf->clCreateMLOpPadQCOM(context, 0, &pad_desc, input->tensor, output->tensor,
                                             &op, tuning_cache);
    ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result;

    layer_.func_ins.push_back(input);
    layer->function.push_back(op);
    return output;
  }
+
+  /*!
+   * \brief Create a Reshape layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
CreateReshapeLayer(CachedLayer* layer,
+                                                                    const 
JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node);
+
+    result = h_ClmlIntf->clCreateMLOpReshapeQCOM(context, 0, input->tensor, 
output->tensor, &op,
+                                                 tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
+  /*!
+   * \brief Create a dense layer.
+   *
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateDenseLayer(CachedLayer* 
layer,
+                                                                  const 
JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
+    bool has_bias = node.GetInputs().size() == 3 ? true : false;
+
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, 
wt_dims.n, wt_dims.c});
+    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+    if (has_bias) {
+      auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
+      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 
1, 1});
+    }
+
+    cl_ml_op_fully_connected_desc_qcom fc_desc = {1, 
CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
+                                                  
CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    auto output = MakeCLMLTensorFromJSONNode(node);
+    if (has_bias) {
+      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, 
&fc_desc, input->tensor,
+                                                          weight->tensor, 
bias->tensor,
+                                                          output->tensor, &op, 
tuning_cache);
+    } else {
+      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, 
&fc_desc, input->tensor,
+                                                          weight->tensor, 
NULL, output->tensor, &op,
+                                                          tuning_cache);
+    }
+    ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result;
+
+    layer->function.push_back(op);
+    layer_.func_ins.push_back(input);
+    return output;
+  }
+
+  /*!
+   * \brief Create a Clip(X) layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the 
CLML output.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateClipLayer(CachedLayer* 
layer,
+                                                                 const 
JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node);
+    cl_float a_max = 
std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
+    cl_float a_min = 
std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
+
+    cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
+                                         {{a_max}, CL_FLOAT},
+                                         {{a_min}, CL_FLOAT},
+                                         CL_ARITHMETIC_MODE_FP32_QCOM};
+
+    result = h_ClmlIntf->clCreateMLOpClipQCOM(context, 0, &clip_desc, 
input->tensor, output->tensor,
+                                              &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
  /*!
   * \brief The network layers represented by CLML functions.
   * \note Currently only supports a single layer.
   */

  CachedLayer layer_;
  // Handle to the CLML interface (v2) function table; NULL until initialized.
  CLMLInterfaceV2QCOM* h_ClmlIntf = NULL;
  // OpenCL platform/context/device/queue shared by all CLML calls in this runtime.
  cl_platform_id platform = NULL;
  cl_context context = NULL;
  cl_device_id device_id = NULL;
  cl_command_queue queue = NULL;
  // OpenCL events list — presumably populated during Run; confirm ownership with caller.
  std::vector<cl_event>* evts;
  // Tuning cache handle plus flags/file path controlling tuning runs.
  cl_ml_tuningcache_qcom tuning_cache = NULL;
  bool is_tuning_run;
  char* tuning_file;
#else
  /*! \brief Stub used when TVM is built without the CLML runtime: fails hard on use. */
  void Run() override {
    LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. "
               << "Please build with USE_CLML_GRAPH_EXECUTOR.";
  }

  /*! \brief Stub used when TVM is built without the CLML runtime: warns and does nothing. */
  void BuildEngine() {
    LOG(WARNING) << "CLML engine is not initialized. "
                 << "Please build with USE_CLML_GRAPH_EXECUTOR.";
  }
#endif
+};
+
+runtime::Module CLMLRuntimeCreate(const String& symbol_name, const String& 
graph_json,
+                                  const Array<String>& const_names) {
+  auto n = make_object<CLMLRuntime>(symbol_name, graph_json, const_names);
+  return runtime::Module(n);
+}
+
+TVM_REGISTER_GLOBAL("runtime.clml_runtime_create").set_body_typed(CLMLRuntimeCreate);
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_clml")
+    .set_body_typed(JSONRuntimeBase::LoadFromBinary<CLMLRuntime>);
+}  //  namespace contrib
+}  //  namespace runtime
+}  //  namespace tvm
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index f98e08ce94..be0cd9eb8f 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -310,6 +310,8 @@ TVM_DLL Map<String, String> GetLibInfo() {
       {"USE_THRUST", TVM_INFO_USE_THRUST},
       {"USE_VITIS_AI", TVM_INFO_USE_VITIS_AI},
       {"USE_VULKAN", TVM_INFO_USE_VULKAN},
+      {"USE_CLML", TVM_INFO_USE_CLML},
+      {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR},
   };
   return result;
 }
diff --git a/python/tvm/relay/op/contrib/__init__.py 
b/tests/python/contrib/test_clml/__init__.py
similarity index 71%
copy from python/tvm/relay/op/contrib/__init__.py
copy to tests/python/contrib/test_clml/__init__.py
index a03d0f6d4f..dfeb9ae5c8 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/tests/python/contrib/test_clml/__init__.py
@@ -14,15 +14,4 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=wildcard-import
-"""Contrib modules."""
-from .register import get_pattern_table, register_pattern_table
-
-from .arm_compute_lib import *
-from .dnnl import *
-from .bnns import *
-from .coreml import *
-from .ethosn import *
-from .libtorch import *
-from .tensorrt import *
-from .cutlass import *
+"""Infrastructure and tests for CLML"""
diff --git a/tests/python/contrib/test_clml/infrastructure.py 
b/tests/python/contrib/test_clml/infrastructure.py
new file mode 100644
index 0000000000..19901d733e
--- /dev/null
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -0,0 +1,256 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from itertools import zip_longest, combinations
+import json
+import os
+import warnings
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm import rpc
+
+# from tvm.contrib.debugger import debug_runtime as graph_executor
+from tvm.contrib import graph_executor
+from tvm.relay.op.contrib import clml
+from tvm.contrib import utils
+from tvm.autotvm.measure import request_remote
+from tvm.relay.expr_functor import ExprMutator, Call
+
+
class Device:
    """
    Configuration for CLML tests.

    Check tests/python/contrib/test_clml/ for the presence of a test_config.json file.
    This file can be used to override the default configuration here, which will attempt
    to run the CLML runtime tests locally if the runtime is available. Changing the
    configuration will allow these runtime tests to be offloaded to a remote device
    via a tracker for example.

    Notes
    -----
        The test configuration will be loaded once when the class is created. If the
        configuration changes between tests, any changes will not be picked up.

    Parameters
    ----------
    device : RPCSession
        Allows tests to connect to and use remote device.

    Attributes
    ----------
    connection_type : str
        Details the type of RPC connection to use. Options:
        local - Use the local device,
        tracker - Connect to a tracker to request a remote device,
        remote - Connect to a remote device directly.
    host : str
        Specify IP address or hostname of remote target.
    port : int
        Specify port number of remote target.
    target : str
        The compilation target.
    device_key : str
        The device key of the remote target. Use when connecting to a remote device via a tracker.
    cross_compile : str
        Specify path to cross compiler to use when connecting a remote device from a non-arm platform.
    """

    # Defaults; may be overridden from test_config.json via load().
    connection_type = "tracker"
    host = "localhost"
    port = 9090
    target = "opencl"
    target_host = "llvm -mtriple=aarch64-linux-gnu"
    device_key = ""
    cross_compile = ""

    def __init__(self):
        """Keep remote device for lifetime of object."""
        self.device = self._get_remote()

    @classmethod
    def _get_remote(cls):
        """Get a remote (or local) device to use for testing."""
        if cls.connection_type == "tracker":
            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
        elif cls.connection_type == "remote":
            device = rpc.connect(cls.host, cls.port)
        elif cls.connection_type == "local":
            device = rpc.LocalSession()
        else:
            raise ValueError(
                "connection_type in test_config.json should be one of: " "local, tracker, remote."
            )

        return device

    @classmethod
    def load(cls, file_name):
        """Load test config

        Load the test configuration by looking for file_name relative
        to the test_clml directory.
        """
        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
        config_file = os.path.join(location, file_name)
        if not os.path.exists(config_file):
            warnings.warn("Config file doesn't exist, resuming CLML tests with default config.")
            return
        with open(config_file, mode="r") as config:
            test_config = json.load(config)

        # Missing optional keys fall back to empty strings.
        cls.connection_type = test_config["connection_type"]
        cls.host = test_config["host"]
        cls.port = test_config["port"]
        cls.target = test_config["target"]
        cls.target_host = test_config["target_host"]
        cls.device_key = test_config.get("device_key") or ""
        cls.cross_compile = test_config.get("cross_compile") or ""
+
+
def skip_runtime_test():
    """Skip test if it requires the runtime and it's not present.

    Returns
    -------
    bool
        True when the test should be skipped, False otherwise.
    """
    # CLML codegen not present.
    if not tvm.get_global_func("relay.ext.clml", True):
        print("Skip because CLML codegen is not available.")
        return True

    # When running locally, the CLML runtime itself must be enabled.
    # The original `not x != "local"` double negative simplifies to `x == "local"`.
    # Note: Ensure that the device config has been loaded before this check.
    if Device.connection_type == "local" and not clml.is_clml_runtime_enabled():
        print("Skip because runtime isn't present or a remote device isn't being used.")
        return True

    # Explicitly return False instead of falling off the end (implicit None).
    return False
+
+
def skip_codegen_test():
    """Skip test if it requires the CLML codegen and it's not present.

    Returns
    -------
    bool
        True when the test should be skipped, False otherwise.
    """
    if not tvm.get_global_func("relay.ext.clml", True):
        print("Skip because CLML codegen is not available.")
        return True
    # Explicitly return False instead of falling off the end (implicit None).
    return False
+
+
def build_module(mod, target, target_host, params=None, enable_clml=True):
    """Build module with option to build for CLML.

    Parameters
    ----------
    mod : tvm.IRModule or relay.expr.Call
        The relay module (a bare Call is wrapped into an IRModule).
    target : str
        The compilation target.
    target_host : str
        The host compilation target.
    params : dict, optional
        Parameters to bind into the module.
    enable_clml : bool
        When True, partition the module for CLML offload first.
    """
    if isinstance(mod, tvm.relay.expr.Call):
        mod = tvm.IRModule.from_expr(mod)

    # AlterOpLayout is disabled so offloaded subgraphs keep the layout CLML expects.
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
        if enable_clml:
            mod = clml.partition_for_clml(mod, params)
        # Clear the TE compiler cache so repeated builds in one test session don't collide.
        relay.backend.te_compiler.get().clear()
        return relay.build(mod, target=target, target_host=target_host, params=params)
+
+
def build_and_run(
    mod,
    inputs,
    outputs,
    params,
    device,
    enable_clml=True,
    no_runs=1,
    config=None,
):
    """Build and run the relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        Module to build and execute.
    inputs : dict
        Named input arrays passed to set_input.
    outputs : int
        Number of outputs to fetch per run.
    params : dict
        Parameters to bind into the module.
    device : Device
        Remote/local device wrapper used for execution.
    enable_clml : bool
        When True, partition for CLML before building.
    no_runs : int
        Number of executions; one output list is collected per run.
    config : dict, optional
        Test parameters echoed into the error message on build failure.

    Returns
    -------
    list
        One list of outputs per run.
    """
    if config is None:
        config = {}

    try:
        libm = build_module(mod, device.target, device.target_host, params, enable_clml)
    except Exception as e:
        err_msg = "The module could not be built.\n"
        if config:
            err_msg += f"The test failed with the following parameters: {config}\n"
        err_msg += str(e)
        # Chain the original exception so the root-cause traceback is preserved.
        raise Exception(err_msg) from e

    lib = update_lib(libm, device.device, device.cross_compile)
    gen_module = graph_executor.GraphModule(lib["default"](device.device.cl(0)))
    gen_module.set_input(**inputs)
    out = []
    for _ in range(no_runs):
        gen_module.run()
        out.append([gen_module.get_output(i) for i in range(outputs)])
    # Report mean execution time for visibility in test logs.
    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50)
    cost = time_f().mean
    print("%g secs/iteration\n" % cost)
    return out
+
+
def update_lib(lib, device, cross_compile):
    """Export the built library and load it on the remote/local device."""
    temp = utils.tempdir()
    lib_name = "mod.so"
    lib_path = temp.relpath(lib_name)
    # Use the cross compiler only when one was configured.
    export_kwargs = {"cc": cross_compile} if cross_compile else {}
    lib.export_library(lib_path, **export_kwargs)
    device.upload(lib_path)
    return device.load_module(lib_name)
+
+
def extract_clml_modules(module):
    """Get the CLML module(s) imported by the built llvm module."""
    return [mod for mod in module.get_lib().imported_modules if mod.type_key == "clml"]
+
+
def verify_codegen(
    module,
    known_good_codegen,
    num_clml_modules=1,
    tvm_ops=0,
    target="llvm -mtriple=aarch64-linux-gnu",
):
    """Check clml codegen against a known good output.

    Parameters
    ----------
    module : tvm.IRModule
        Module whose partitioned codegen JSON is verified.
    known_good_codegen : list
        Expected JSON "nodes" structure.
    num_clml_modules : int
        Expected number of CLML partitions.
    tvm_ops : int
        Kept for interface compatibility; not used by build_module.
    target : str
        Compilation target; also used as target_host here.
    """
    # Fix: build_module has signature (mod, target, target_host, params, enable_clml);
    # the original call passed unsupported tvm_ops/clml_partitions kwargs and omitted
    # target_host, which raised TypeError before any check ran.
    module = build_module(module, target, target)
    clml_modules = extract_clml_modules(module)

    assert len(clml_modules) == num_clml_modules, (
        f"The number of CLML modules produced ({len(clml_modules)}) does not "
        f"match the expected value ({num_clml_modules})."
    )

    for mod in clml_modules:
        source = mod.get_source("json")
        codegen = json.loads(source)["nodes"]
        # remove input and const names as these cannot be predetermined
        for node in range(len(codegen)):
            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
                codegen[node]["name"] = ""
        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)

        assert codegen_str == known_good_codegen_str, (
            f"The JSON produced by codegen does not match the expected result. \n"
            f"Actual={codegen_str} \n"
            f"Expected={known_good_codegen_str}"
        )
diff --git a/tests/python/contrib/test_clml/test_network.py 
b/tests/python/contrib/test_clml/test_network.py
new file mode 100644
index 0000000000..d89676f10e
--- /dev/null
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -0,0 +1,139 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""OpenCL ML network tests."""
+
+import numpy as np
+import pytest
+from tvm import testing
+from tvm import relay
+
+import tvm
+from test_clml.infrastructure import skip_runtime_test, build_and_run
+from test_clml.infrastructure import Device
+
+
def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
    """Build and run a network, first with CLML offload enabled then disabled.

    Returns a two-element list: [clml_outputs, opencl_outputs].
    """
    return [
        build_and_run(
            mod,
            data,
            1,
            params,
            device,
            enable_clml=use_clml,
        )[0]
        for use_clml in (True, False)
    ]
+
+
def _get_keras_model(keras_model, inputs_dict, data):
    """Convert Keras graph to relay.

    Returns the relay module, its parameters, and the Keras reference output
    computed on NHWC-transposed input data.
    """
    inputs = {}
    for name, (shape, _) in inputs_dict.items():
        # NOTE(review): this always keys on input_names[0] rather than `name`, so it is
        # only correct for single-input models — confirm intent before reusing for
        # multi-input networks.
        inputs[keras_model.input_names[0]] = shape

    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    def get_bottom_top_model(model, layer_name):
        # Rebuild the model truncated at `layer_name` by replaying layers sequentially.
        layer = model.get_layer(layer_name)
        bottom_input = model.layers[0].input
        bottom_output = bottom_input
        for layer in model.layers:
            bottom_output = layer(bottom_output)
            if layer.name == layer_name:
                break
        bottom_model = Model(bottom_input, bottom_output)
        return bottom_model

    keras_model = get_bottom_top_model(keras_model, "predictions")
    # Keras expects NHWC; the stored test data is NCHW, hence the transpose.
    ref_output = keras_model.predict(data["input_1"].transpose(0, 2, 3, 1))

    mod, params = relay.frontend.from_keras(keras_model, inputs, layout="NCHW")
    return mod, params, ref_output
+
+
def test_mobilenet():
    """Run MobileNet through CLML and plain OpenCL and compare top-10 class ordering."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    dtype = "float16"

    def get_model():
        # Build a Keras MobileNet and convert it to relay along with random test data.
        from tensorflow.keras.applications import MobileNet

        mobilenet = MobileNet(
            include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
        )
        # NOTE(review): requires this weights file to be present in the working
        # directory — confirm how it is provisioned in CI.
        mobilenet.load_weights("mobilenet_1_0_224_tf.h5")
        inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")}

        data = {}
        np.random.seed(0)

        # Uniform random inputs; uint8 inputs get [0, 1), others [-1, 1).
        for name, (shape, dtype) in inputs.items():
            if dtype == "uint8":
                low, high = 0, 1
            else:
                low, high = -1, 1
            data[name] = np.random.uniform(low, high, shape).astype(dtype)

        mod, params, ref_outputs = _get_keras_model(mobilenet, inputs, data)
        return mod, params, inputs, data, ref_outputs

    mod, params, inputs, input_data, ref_outputs = get_model()
    outputs = _build_and_run_network(
        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
    )

    # test
    print("OpenCL:", outputs[0][0].asnumpy().shape)
    print("CLML:", outputs[1][0].asnumpy().shape)

    # Compare the ordering of the top-10 predicted classes rather than raw scores.
    opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten()
    clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten()

    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
+
+
+"""
+    tvm.testing.assert_allclose(
+         ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
+    print("OpenCL to Keras looks good")
+    tvm.testing.assert_allclose(
+         outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, 
atol=1e-5)
+    print("OpenCL to CLML looks good")
+    exit(0)
+
+    tvm.testing.assert_allclose(
+         ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(), 
rtol=1e-5, atol=1e-5)
+    print("OpenCL to Keras looks good")
+    tvm.testing.assert_allclose(
+         outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, 
atol=1e-5)
+    print("OpenCL to CLML looks good")
+"""
+
+
+if __name__ == "__main__":
+    test_mobilenet()
diff --git a/tests/python/contrib/test_clml/test_ops.py 
b/tests/python/contrib/test_clml/test_ops.py
new file mode 100644
index 0000000000..63f5bc168f
--- /dev/null
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML integration conv2d tests."""
+
+import numpy as np
+
+np.random.seed(0)
+
+import tvm
+from tvm import testing
+from tvm import relay
+from tvm.ir import IRModule
+
+from test_clml.infrastructure import (
+    skip_runtime_test,
+    skip_codegen_test,
+    build_and_run,
+    Device,
+)
+
+
def _get_conv_model(
    shape,
    kernel_h,
    kernel_w,
    padding,
    strides,
    dilation,
    groups,
    dtype,
    channels,
    var,
    has_bias=False,
    has_activation=False,
    has_pad=False,
):
    """Build a relay conv2d model (optionally with pad/bias/relu fused) and its params.

    Parameters
    ----------
    shape : tuple
        Input shape in NCHW order.
    kernel_h, kernel_w : int
        Kernel spatial dimensions.
    padding : tuple
        (pad_h, pad_w) or a 4-tuple; normalized to a 4-tuple below.
    strides, dilation : tuple
        Conv2d strides and dilation.
    groups : int
        Grouped-convolution count; depthwise when groups == in/out channels.
    dtype : str
        Data type of input, weights and output.
    channels : int
        Number of output channels.
    var : dict
        Mapping of the single input name to its tvm.nd.array value.
    has_bias, has_activation, has_pad : bool
        Optionally fuse bias_add / relu / an explicit pad op.

    Returns
    -------
    out : relay.Expr
        The resulting relay expression.
    params : dict
        Constant parameters ("w" and optionally "b").
    """
    a = relay.var(next(iter(var)), shape=shape, dtype=dtype)
    if has_pad:
        # Fold the padding into an explicit nn.pad op and zero it on the conv.
        p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
        a = relay.nn.pad(a, pad_width=p)
        padding = (0, 0, 0, 0)
    else:
        if len(padding) == 2:
            padding = (padding[0], padding[1], padding[0], padding[1])
        shape = (shape[0], shape[1], shape[2] + padding[0] * 2, shape[3] + padding[1] * 2)
    is_depthwise = shape[1] == channels == groups

    # Depthwise convolutions use the IOHW kernel layout, regular ones OIHW.
    # (The previous ternary returned "OIHW" in both branches, which made the
    # IOHW weight-shape branch below unreachable.)
    weight_format = "IOHW" if is_depthwise else "OIHW"
    if weight_format == "IOHW":
        weight_shape = (shape[1] // groups, channels, kernel_h, kernel_w)
    else:
        weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w)

    w = tvm.nd.array(np.random.uniform(-1, 1, weight_shape).astype(dtype))
    weights = relay.const(w, dtype)
    out = relay.nn.conv2d(
        a,
        weights,
        kernel_size=(kernel_h, kernel_w),
        data_layout="NCHW",
        kernel_layout=weight_format,
        dilation=dilation,
        strides=strides,
        padding=padding,
        groups=groups,
        channels=channels,
        out_dtype=dtype,
    )
    params = {"w": w}
    if has_bias:
        # bias_add on axis=1 must match the number of output channels; for
        # depthwise convs that is `channels`, otherwise the O dim of OIHW.
        bias_shape = channels if is_depthwise else weight_shape[0]
        b = tvm.nd.array(np.random.uniform(-1, 1, bias_shape).astype(dtype))
        biasc = relay.const(b, dtype)
        out = relay.nn.bias_add(out, biasc, axis=1)
        params["b"] = b

    if has_activation:
        out = relay.nn.relu(out)

    return out, params
+
+
def test_conv2d():
    """Compare conv2d (with optional pad/bias/relu fusion) between plain OpenCL
    and the CLML BYOC path across a range of kernel/stride/shape combinations."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    np.random.seed(0)

    dtype = "float32"

    # Each trial: kernel_h, kernel_w, pad, stride, dilation, out_channels,
    # input shape (C, H, W), and (has_pad, has_bias, has_activation) flags.
    trials = [
        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (False, False, True)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True)],
        [2, 2, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [2, 1, (2, 2), (1, 1), (1, 1), 7, (16, 12, 15), (False, False, True)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)],
        [3, 3, (1, 1), (1, 1), (1, 1), 16, (16, 12, 15), (False, False, False)],
        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True)],
        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False)],
        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False)],
        [3, 3, (1, 1), (2, 2), (1, 1), 16, (14, 10, 10), (False, True, True)],
    ]

    for (
        kernel_h,
        kernel_w,
        pad,
        stride,
        dilation,
        out_channels,
        shape,
        composite,
    ) in trials:
        shape = (1, *shape)  # prepend batch dimension
        groups = 1
        inputs = {
            "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)),
        }

        func, params = _get_conv_model(
            shape,
            kernel_h,
            kernel_w,
            pad,
            stride,
            dilation,
            groups,
            dtype,
            out_channels,
            inputs,
            has_pad=composite[0],
            has_bias=composite[1],
            has_activation=composite[2],
        )
        # Reference run on plain OpenCL vs. the CLML-offloaded run.
        opencl_out = build_and_run(func, inputs, 1, params, device, enable_clml=False)[0]
        clml_out = build_and_run(func, inputs, 1, params, device, enable_clml=True)[0]

        tvm.testing.assert_allclose(
            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, atol=1e-5
        )
+
+
def test_batchnorm():
    """Check that batch_norm produces matching results on plain OpenCL and CLML."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    np.random.seed(0)

    dtype = "float32"
    in_shape = (1, 8, 64, 64)
    channels = 8

    # Random NCHW input plus per-channel scale/shift constants.
    input_arr = tvm.nd.array(np.random.uniform(-1, 1, in_shape).astype(dtype))
    data_np = input_arr.asnumpy()
    inp = relay.var("a", shape=in_shape, dtype=dtype)
    gamma_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
    beta_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
    gamma = relay.const(gamma_arr, dtype)
    beta = relay.const(beta_arr, dtype)

    # Moving mean/variance are computed directly from the input so the
    # normalization has a well-defined reference.
    mean = relay.const(tvm.nd.array(np.mean(data_np, axis=(0, 2, 3), keepdims=False)))
    variance = relay.const(tvm.nd.array(np.var(data_np, axis=(0, 2, 3), keepdims=False)))

    func = relay.nn.batch_norm(inp, gamma, beta, mean, variance, axis=1, epsilon=0.0001)[0]
    mod = IRModule.from_expr(func)

    params = {}
    inputs = {"a": input_arr}

    opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
    clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]

    tvm.testing.assert_allclose(
        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, atol=1e-5
    )
+
+
+if __name__ == "__main__":
+    # test_conv2d()
+    test_batchnorm()

Reply via email to