(tvm) branch main updated: [RELAX][BYOC] OpenCLML offload support for Relax (#17654)

tqchen Wed, 19 Feb 2025 07:51:56 -0800

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new cc2f0796ba [RELAX][BYOC] OpenCLML offload support for Relax (#17654)
cc2f0796ba is described below

commit cc2f0796ba37de45a834f6467557cc5efae6b075
Author: Siva <[email protected]>
AuthorDate: Wed Feb 19 21:21:21 2025 +0530

    [RELAX][BYOC] OpenCLML offload support for Relax (#17654)
    
    
    This brings in OpenCLML offloading via BYOC path with available operators 
in Relax.
    Adds codegen tests for Mainline CI.
---
 cmake/modules/contrib/CLML.cmake                   |   4 +-
 python/tvm/relax/backend/__init__.py               |   2 +-
 .../backend/{gpu_generic => adreno}/__init__.py    |   3 +-
 python/tvm/relax/backend/adreno/clml.py            | 618 +++++++++++++++++
 python/tvm/relax/backend/adreno/pipeline.py        |  75 +++
 python/tvm/relax/backend/gpu_generic/__init__.py   |   1 +
 python/tvm/relax/pipeline.py                       |  10 +
 python/tvm/relax/transform/__init__.py             |   1 +
 .../fold_batch_norm_to_conv2d_for_inference.py     | 102 +++
 python/tvm/target/__init__.py                      |   1 +
 python/tvm/target/target.py                        |   7 +-
 src/relax/backend/contrib/clml/codegen.cc          | 358 ++++++++++
 src/relax/backend/contrib/utils.cc                 |  11 +
 src/relax/backend/contrib/utils.h                  |  26 +-
 src/runtime/contrib/clml/clml_runtime.cc           | 181 +++--
 src/runtime/contrib/clml/clml_runtime.h            |  17 +-
 src/runtime/contrib/clml/clml_utils.cc             |  11 +
 src/runtime/contrib/clml/clml_utils.h              |   2 +
 .../relax/backend/clml/conftest.py}                |  42 +-
 tests/python/relax/backend/clml/mod_utils.py       | 728 +++++++++++++++++++++
 .../python/relax/backend/clml/test_clml_codegen.py | 505 ++++++++++++++
 .../backend/clml/test_op_exec_clml_codegen.py      | 329 ++++++++++
 tests/python/relax/backend/clml/utils.py           |  90 +++
 .../test_transform_fold_batch_norm_to_conv2d.py    | 144 ++++
 tests/scripts/task_config_build_gpu.sh             |   1 +
 tests/scripts/task_python_adreno.sh                |   9 +
 tests/scripts/unity/task_python_relax.sh           |   3 +
 27 files changed, 3186 insertions(+), 95 deletions(-)

diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake
index 118091696a..21621bf34c 100644
--- a/cmake/modules/contrib/CLML.cmake
+++ b/cmake/modules/contrib/CLML.cmake
@@ -16,10 +16,10 @@
 # under the License.
 
 if(USE_CLML)
-    file(GLOB CLML_RELAY_CONTRIB_SRC src/relay/backend/contrib/clml/*.cc)
+    file(GLOB CLML_RELAX_CONTRIB_SRC src/relax/backend/contrib/clml/*.cc)
     file(GLOB CLML_RUNTIME_MODULE src/runtime/contrib/clml/clml_runtime.cc)
     include_directories(SYSTEM "3rdparty/OpenCL-Headers")
-    list(APPEND COMPILER_SRCS ${CLML_RELAY_CONTRIB_SRC})
+    list(APPEND COMPILER_SRCS ${CLML_RELAX_CONTRIB_SRC})
     if(NOT USE_CLML_GRAPH_EXECUTOR)
         list(APPEND COMPILER_SRCS ${CLML_RUNTIME_MODULE})
     endif()
diff --git a/python/tvm/relax/backend/__init__.py 
b/python/tvm/relax/backend/__init__.py
index 2a64ffe27b..f414ae0c54 100644
--- a/python/tvm/relax/backend/__init__.py
+++ b/python/tvm/relax/backend/__init__.py
@@ -16,7 +16,7 @@
 # under the License.
 """Relax backends"""
 
-from . import contrib, cpu_generic, cuda, gpu_generic, metal, rocm
+from . import contrib, cpu_generic, cuda, gpu_generic, metal, rocm, adreno
 from .dispatch_sampling import DispatchSampling
 from .dispatch_sort_scan import DispatchSortScan
 from .pattern_registry import get_pattern, get_patterns_with_prefix
diff --git a/python/tvm/relax/backend/gpu_generic/__init__.py 
b/python/tvm/relax/backend/adreno/__init__.py
similarity index 90%
copy from python/tvm/relax/backend/gpu_generic/__init__.py
copy to python/tvm/relax/backend/adreno/__init__.py
index 9c5e65fb49..b3364f2f4b 100644
--- a/python/tvm/relax/backend/gpu_generic/__init__.py
+++ b/python/tvm/relax/backend/adreno/__init__.py
@@ -14,10 +14,11 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""The Relax Metal backend compilation pipeline and other passes."""
+"""The Relax Adreno backend compilation pipeline and other passes."""
 from .pipeline import (
     finalize_passes,
     get_default_pipeline,
+    dataflow_lower_passes,
     legalize_passes,
     library_dispatch_passes,
 )
diff --git a/python/tvm/relax/backend/adreno/clml.py 
b/python/tvm/relax/backend/adreno/clml.py
new file mode 100644
index 0000000000..e50ac0dc1d
--- /dev/null
+++ b/python/tvm/relax/backend/adreno/clml.py
@@ -0,0 +1,618 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument, pointless-exception-statement
+"""Pattern table for CLML backend"""
+import tvm
+from tvm import relax, IRModule
+from tvm.ir.transform import PassContext, module_pass
+from tvm.relax import transform
+from tvm.relax.expr_functor import PyExprMutator, mutator
+from tvm.relax.expr import TupleGetItem, VarBinding
+from tvm.relax.dpl.pattern import (
+    is_const,
+    is_op,
+    is_tuple_get_item,
+    wildcard,
+)
+from tvm.relax.transform import PatternCheckContext
+from ..pattern_registry import register_patterns
+
+
+@mutator
+class AppendReshapeToBNRewriter(PyExprMutator):
+    """
+    Append Reshape Operator to BatchNorm Pass Rewriter Pass
+
+    - Automatically appends a reshape operation after BatchNorm operators
+    - Resolves fusion issues for custom backends where BatchNorm output
+      might explicitly access the first elment of the Tuple
+
+    Algo:
+    Identifies BatchNorm operators in the computational graph
+    When BatchNorm's first output is accessed via TupleGetItem
+    Automatically inserts a reshape operation to match input shape
+
+    """
+
+    def __init__(self, mod):
+        super().__init__(mod)
+        self.bn_vars = {}
+
+    def visit_tuple_getitem_(self, op: TupleGetItem):
+        tuple_value = op.tuple_value
+        reshape_op = tvm.ir.Op.get("relax.reshape")
+
+        if isinstance(tuple_value, relax.Var) and tuple_value in self.bn_vars:
+            bn_call = self.bn_vars[tuple_value]
+            if op.index == 0:
+                bn_out = relax.TupleGetItem(bn_call, 0)
+                input_shape = bn_call.args[0].struct_info.shape
+                return relax.Call(reshape_op, [bn_out, input_shape])
+
+        return super().visit_tuple_getitem_(op)
+
+    def visit_var_binding_(self, binding: VarBinding):
+        if isinstance(binding.value, relax.Call) and binding.value.op.name == 
"relax.nn.batch_norm":
+            self.bn_vars[binding.var] = binding.value
+        return super().visit_var_binding_(binding)
+
+
[email protected]_pass(opt_level=0, name="AppendReshapeToBN")
+class AppendReshapeToBNRewriterPass:
+    def transform_function(
+        self, func: relax.Function, mod: IRModule, _ctx: 
tvm.transform.PassContext
+    ) -> relax.Function:
+        updated_func = AppendReshapeToBNRewriter(mod).visit_expr(func)
+        updated_func = relax.analysis.remove_all_unused(updated_func)
+        return updated_func
+
+
+def clml_sdk_version():
+    """Utility function to get clml version"""
+
+    return int(tvm.support.libinfo().get("TVM_CLML_VERSION", 2))
+
+
+def is_clml_runtime_enabled():
+    """Check if the CLML graph runtime is present.
+
+    Returns
+    -------
+    ret: bool
+        True if present, False if not.
+    """
+    check_enabled = 
tvm.get_global_func("relax.op.is_openclml_runtime_enabled", True)
+    if check_enabled:
+        return check_enabled()
+    return False
+
+
+def _check_default(context: PatternCheckContext) -> bool:
+    return True
+
+
+def clml_pattern_table():
+    """Get the CLML pattern table."""
+
+    def _check_conv2d(context: PatternCheckContext) -> bool:
+        if "root" in context.annotated_expr:
+            root_call = context.annotated_expr["root"]
+            if root_call.op.name == "relax.nn.conv2d":
+                input_layout = root_call.attrs.data_layout
+                weight_layout = root_call.attrs.kernel_layout
+                if input_layout != "NCHW" or weight_layout != "OIHW":
+                    return False
+            if root_call.op.name == "relax.nn.conv2d_transpose":
+                input_layout = root_call.attrs.data_layout
+                weight_layout = root_call.attrs.kernel_layout
+                if input_layout != "NCHW" or weight_layout != "OIHW":
+                    return False
+
+        if "data" in context.annotated_expr:
+            input_expr = context.annotated_expr["data"]
+            input_dtype = input_expr.struct_info.dtype
+            if input_dtype not in ["float32", "float16"]:
+                return False
+
+        if "weight" in context.annotated_expr:
+            weight_expr = context.annotated_expr["weight"]
+            weight_dtype = weight_expr.struct_info.dtype
+            if weight_dtype not in ["float32", "float16"]:
+                return False
+
+        return True
+
+    def populate_patterns(patterns, name, op, annotations, *args):
+        ret = {}
+        for k, v in patterns.items():
+            ret_ann = v["annotation"].copy()
+            ret_ann.update(annotations)
+            ret[name + "." + k] = {"pattern": op(v["pattern"], *args), 
"annotation": ret_ann.copy()}
+
+        return ret
+
+    def conv_pattern():
+        """Create a convolution pattern."""
+        data = wildcard()
+        weight = wildcard()
+        bias = is_const()
+        bn_scale = is_const()
+        bn_bias = is_const()
+        bn_mean = is_const()
+        bn_var = is_const()
+
+        annotations = {
+            "data": data,
+            "weight": weight,
+        }
+
+        patterns = {}
+        patterns["nn.conv2d"] = {
+            "pattern": is_op("relax.nn.conv2d")(data, weight),
+            "annotation": annotations.copy(),
+        }
+
+        pad_annotations = annotations.copy()
+        patterns["pad.nn.conv2d"] = {
+            "pattern": is_op("relax.nn.conv2d")(is_op("relax.nn.pad")(data), 
weight),
+            "annotation": pad_annotations,
+        }
+
+        patterns["nn.conv2d_transpose"] = {
+            "pattern": is_op("relax.nn.conv2d_transpose")(data, weight),
+            "annotation": annotations.copy(),
+        }
+        patterns.update(
+            populate_patterns(patterns, "bias", is_op("relax.add"), {"bias": 
bias}, bias)
+        )
+        patterns.update(
+            populate_patterns(
+                patterns,
+                "bn",
+                is_op("relax.nn.batch_norm"),
+                {
+                    "bn_scale": bn_scale,
+                    "bn_bias": bn_bias,
+                    "bn_mean": bn_mean,
+                    "bn_var": bn_var,
+                },
+                bn_scale,
+                bn_bias,
+                bn_mean,
+                bn_var,
+            )
+        )
+        tuple_patterns = {}
+        for k, v in patterns.items():
+            tuple_annotation = v["annotation"].copy()
+            tuple_patterns["tuple" + "." + k] = {
+                "pattern": is_tuple_get_item(v["pattern"], 0),
+                "annotation": tuple_annotation,
+            }
+        patterns.update(tuple_patterns)
+
+        relu_patterns = populate_patterns(patterns, "relu", 
is_op("relax.nn.relu"), {})
+        clip_patterns = populate_patterns(patterns, "clip", 
is_op("relax.clip"), {})
+        patterns.update(relu_patterns)
+        patterns.update(clip_patterns)
+
+        conv_patterns = []
+        for k, v in patterns.items():
+            ret_annotations = v["annotation"]
+            ret_annotations["root"] = v["pattern"]
+            conv_patterns.append(
+                ("openclml." + (k), v["pattern"], ret_annotations.copy(), 
_check_conv2d)
+            )
+        return conv_patterns[::-1]
+
+    def _check_maxpool2d(context: PatternCheckContext) -> bool:
+        root = context.annotated_expr.get("root")
+        if not root or not isinstance(root, relax.Call):
+            return False
+
+        if root.op.name != "relax.nn.max_pool2d":
+            return False
+
+        if "data" not in context.annotated_expr:
+            return False
+
+        data = context.annotated_expr["data"]
+        input_shape = data.struct_info.shape
+
+        if len(input_shape) != 4:
+            return False
+
+        if any(dim <= 0 for dim in input_shape):
+            return False
+
+        pool_size = root.attrs.pool_size
+        if len(pool_size) != 2:
+            return False
+        if any(size <= 0 for size in pool_size):
+            return False
+
+        strides = root.attrs.strides
+        if len(strides) != 2:
+            return False
+        if any(stride <= 0 for stride in strides):
+            return False
+
+        dilation = root.attrs.dilation
+        if len(dilation) != 2:
+            return False
+        if any(d <= 0 for d in dilation):
+            return False
+
+        padding = root.attrs.padding
+        if len(padding) != 4:
+            return False
+        if any(p < 0 for p in padding):
+            return False
+
+        return True
+
+    def maxpool_pattern():
+
+        """Create Pool Pattern"""
+        data = wildcard()
+        annotations = {
+            "data": data,
+        }
+        patterns = {}
+        patterns["nn.max_pool2d"] = {
+            "pattern": is_op("relax.nn.max_pool2d")(data),
+            "annotation": annotations.copy(),
+        }
+
+        pool_patterns = []
+        for k, v in patterns.items():
+            ret_annotations = v["annotation"]
+            ret_annotations["root"] = v["pattern"]
+            pool_patterns.append(
+                ("openclml." + (k), v["pattern"], ret_annotations.copy(), 
_check_maxpool2d)
+            )
+        return pool_patterns
+
+    def _check_avgpool2d(context: PatternCheckContext) -> bool:
+        root = context.annotated_expr.get("root")
+        if not root or not isinstance(root, relax.Call):
+            return False
+
+        if root.op.name != "relax.nn.avg_pool2d":
+            return False
+
+        if "data" not in context.annotated_expr:
+            return False
+
+        data = context.annotated_expr["data"]
+        input_shape = data.struct_info.shape
+
+        if len(input_shape) != 4:
+            return False
+
+        if any(dim <= 0 for dim in input_shape):
+            return False
+
+        pool_size = root.attrs.pool_size
+        if len(pool_size) != 2:
+            return False
+        if any(size <= 0 for size in pool_size):
+            return False
+
+        strides = root.attrs.strides
+        if len(strides) != 2:
+            return False
+        if any(stride <= 0 for stride in strides):
+            return False
+
+        padding = root.attrs.padding
+        if len(padding) != 4:
+            return False
+        if any(p < 0 for p in padding):
+            return False
+
+        return True
+
+    def avgpool_pattern():
+
+        data = wildcard()
+        annotations = {
+            "data": data,
+        }
+        patterns = {}
+        patterns["nn.avg_pool2d"] = {
+            "pattern": is_op("relax.nn.avg_pool2d")(data),
+            "annotation": annotations.copy(),
+        }
+
+        pool_patterns = []
+        for k, v in patterns.items():
+            ret_annotations = v["annotation"]
+            ret_annotations["root"] = v["pattern"]
+            pool_patterns.append(
+                ("openclml." + (k), v["pattern"], ret_annotations.copy(), 
_check_avgpool2d)
+            )
+        return pool_patterns
+
+    def _check_global_avgpool(context: PatternCheckContext) -> bool:
+
+        root = context.annotated_expr.get("root")
+        if not root or not isinstance(root, relax.Call):
+            return False
+
+        if root.op.name != "relax.mean":
+            return False
+
+        if "data" not in context.annotated_expr:
+            return False
+
+        data = context.annotated_expr["data"]
+        input_shape = data.struct_info.shape
+
+        if len(input_shape) != 4:
+            return False
+
+        if input_shape[1] <= 0 or input_shape[2] <= 0 or input_shape[3] <= 0:
+            return False
+
+        if not hasattr(root.attrs, "axis"):
+            return False
+
+        axis = root.attrs.axis
+        if not (len(axis) == 2 and axis[0] == 2 and axis[1] == 3):
+            return False
+
+        return True
+
+    def global_avgpool_pattern():
+
+        """Create Pool Pattern"""
+        data = wildcard()
+        pattern = is_op("relax.mean")(data).has_attr({"axis": [2, 3]})
+
+        annotations = {
+            "data": data,
+            "root": pattern,
+        }
+
+        return [
+            ("openclml.nn.global_avg_pool2d", pattern, annotations, 
_check_global_avgpool),
+        ]
+
+    def _check_reshape(context: PatternCheckContext) -> bool:
+
+        root = context.annotated_expr.get("root")
+        if not root or not isinstance(root, relax.Call):
+            return False
+
+        if root.op.name != "relax.reshape":
+            return False
+
+        shape_arg = root.args[1]
+        if not isinstance(shape_arg, relax.Expr):
+            return False
+
+        return True
+
+    def reshape_pattern():
+
+        """Create Reshape Pattern"""
+
+        pattern = is_op("relax.reshape")(wildcard(), wildcard())
+        annotations = {
+            "root": pattern,
+        }
+        return [("openclml.reshape", pattern, annotations, _check_reshape)]
+
+    def _check_batchnorm(context: PatternCheckContext) -> bool:
+        root = context.annotated_expr.get("root")
+        if not root or not isinstance(root, relax.Call):
+            return False
+
+        if root.op.name != "relax.reshape":
+            return False
+
+        required_params = ["moving_var", "gamma", "moving_mean", "beta"]
+        for param in required_params:
+            if param not in context.annotated_expr:
+                return False
+
+        params = {
+            "moving_var": context.annotated_expr["moving_var"],
+            "gamma": context.annotated_expr["gamma"],
+            "moving_mean": context.annotated_expr["moving_mean"],
+            "beta": context.annotated_expr["beta"],
+        }
+
+        for param in params.values():
+            if not isinstance(param, relax.expr.Constant):
+                return False
+
+        base_shape = None
+        for param in params.values():
+            shape = param.struct_info.shape
+            dtype = param.struct_info.dtype
+
+            if dtype not in {"float32"}:
+                return False
+
+            # Initialize base_shape if not set
+            if base_shape is None:
+                base_shape = shape
+                continue
+
+            # All parameters should have same shape
+            if len(shape) != len(base_shape):
+                return False
+            if any(s1 != s2 for s1, s2 in zip(shape, base_shape)):
+                return False
+
+        return True
+
+    def batch_norm_pattern():
+        """Create a batch norm pattern."""
+        data = wildcard()
+        bn_scale = is_const()
+        bn_bias = is_const()
+        bn_mean = is_const()
+        bn_var = is_const()
+
+        pattern = is_op("relax.nn.batch_norm")(data, bn_scale, bn_bias, 
bn_mean, bn_var)
+        pattern = is_tuple_get_item(pattern, 0)
+        pattern = is_op("relax.reshape")(pattern, wildcard())
+
+        annotations = {
+            "gamma": bn_scale,
+            "beta": bn_bias,
+            "moving_mean": bn_mean,
+            "moving_var": bn_var,
+            "root": pattern,
+        }
+
+        return [
+            ("openclml.nn.batch_norm", pattern, annotations, _check_batchnorm),
+        ]
+
+    def _check_binary_op(context: PatternCheckContext) -> bool:
+        def _check_arg(input_expr):
+            input_dtype = input_expr.struct_info.dtype
+            input_shape = input_expr.struct_info.shape
+            if len(input_shape) == 0:
+                return False
+
+            # Avoid any operators with dtype Int64
+            if input_dtype == "int64":
+                return False
+
+            # No support for batch> 1
+            if input_shape[0] > 1:
+                return False
+
+            return True
+
+        def compare_shapes(lhs_shape, rhs_shape):
+            if len(lhs_shape) != len(rhs_shape):
+                return False
+            for lhs_dim, rhs_dim in zip(lhs_shape, rhs_shape):
+                if lhs_dim != rhs_dim:
+                    return False
+            return True
+
+        lhs_shape = None
+        rhs_shape = None
+        if "lhs" in context.annotated_expr:
+            lhs = context.annotated_expr["lhs"]
+            lhs_shape = lhs.struct_info.shape
+            if not _check_arg(lhs):
+                return False
+
+        if "rhs" in context.annotated_expr:
+            rhs = context.annotated_expr["rhs"]
+            rhs_shape = rhs.struct_info.shape
+            if not _check_arg(rhs):
+                return False
+
+        # Checking for BinaryOps ( False for unaryOp )
+        if (
+            "lhs" in context.annotated_expr
+            and "rhs" in context.annotated_expr
+            and not compare_shapes(lhs_shape, rhs_shape)
+        ):
+
+            return False
+
+        return True
+
+    def binary_op_pattern():
+        """Create a binary op pattern."""
+
+        def make_pattern(op):
+            lhs = wildcard()
+            rhs = wildcard()
+            pattern = is_op(op)(lhs, rhs)
+            annotations = {"lhs": lhs, "rhs": rhs}
+            return ("openclml." + op, pattern, annotations, _check_binary_op)
+
+        binary_ops = [
+            "relax.add",
+            "relax.subtract",
+            "relax.multiply",
+            "relax.divide",
+            "relax.maximum",
+            "relax.minimum",
+        ]
+
+        return [make_pattern(op) for op in binary_ops]
+
+    def unary_op_pattern():
+        """Create a unary op pattern."""
+
+        def make_pattern(op):
+            lhs = wildcard()
+            pattern = is_op(op)(lhs)
+            annotations = {"lhs": lhs}
+            return ("openclml." + op, pattern, annotations, _check_binary_op)
+
+        unary_ops = [
+            "relax.nn.softmax",
+            "relax.nn.relu",
+            "relax.clip",
+        ]
+
+        return [make_pattern(op) for op in unary_ops]
+
+    return [
+        *conv_pattern(),
+        *batch_norm_pattern(),
+        *binary_op_pattern(),
+        *unary_op_pattern(),
+        *maxpool_pattern(),
+        *avgpool_pattern(),
+        *global_avgpool_pattern(),
+        *reshape_pattern(),
+    ]
+
+
+clml_patterns = clml_pattern_table()
+register_patterns(clml_patterns)
+
+
+@module_pass(opt_level=0, name="OpenCLMLOffLoad")
+class OpenCLMLOffLoad:
+    """The pass sequence used for CLML offload"""
+
+    def transform_module(self, mod: IRModule, ctx: PassContext) -> IRModule:
+        """The transform"""
+
+        clml_layouts = {
+            "relax.nn.conv2d": ["NCHW", "OIHW"],
+            "relax.nn.conv2d_transpose": ["NCHW", "OIHW"],
+        }
+        seq = tvm.transform.Sequential(
+            [
+                transform.ConvertLayout(clml_layouts),
+                transform.Normalize(),
+                transform.FoldBatchnormToConv2D(),
+                AppendReshapeToBNRewriterPass(),
+                transform.FoldConstant(),
+                transform.FuseOpsByPattern(clml_pattern_table()),
+                transform.MergeCompositeFunctions(),
+                transform.RunCodegen(),
+            ],
+        )
+        mod = seq(mod)
+        return mod
diff --git a/python/tvm/relax/backend/adreno/pipeline.py 
b/python/tvm/relax/backend/adreno/pipeline.py
new file mode 100644
index 0000000000..612b8ce701
--- /dev/null
+++ b/python/tvm/relax/backend/adreno/pipeline.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""The Relax Adreno GPU backend compilation pipeline and other passes."""
+import tvm
+from tvm import dlight as dl
+from tvm import relax
+
+
+def library_dispatch_passes(target: tvm.target.Target):  # pylint: 
disable=unused-argument
+    """The default library dispatch passes for Adreno GPU backend."""
+    if "clml" in target.keys:
+        return [relax.backend.adreno.clml.OpenCLMLOffLoad()]
+    else:
+        return []
+
+
+def legalize_passes(target: tvm.target.Target):  # pylint: 
disable=unused-argument
+    """The default legalization passes for Adreno GPU backend."""
+    return [
+        relax.transform.DecomposeOpsForInference(),
+        relax.transform.FoldConstant(),
+        relax.transform.LegalizeOps(),
+        relax.transform.AnnotateTIROpPattern(),
+        relax.transform.FoldConstant(),
+        relax.transform.FuseOps(),
+        relax.transform.FuseTIR(),
+        relax.transform.DeadCodeElimination(),
+        dl.ApplyDefaultSchedule(
+            dl.gpu.Reduction(),
+            dl.gpu.GeneralReduction(),
+            dl.gpu.Fallback(),
+        ),
+    ]
+
+
+def dataflow_lower_passes(target: tvm.target.Target):  # pylint: 
disable=unused-argument
+    """The default dataflow lowering passes for Adreno GPU backend."""
+    return relax.backend.gpu_generic.dataflow_lower_passes(target)
+
+
+def finalize_passes(target: tvm.target.Target):  # pylint: 
disable=unused-argument
+    """The default finalization passes for Adreno GPU backend."""
+    return relax.backend.gpu_generic.finalize_passes(target)
+
+
+def get_default_pipeline(target: tvm.target.Target):
+    """Return the default compilation pipeline for Adreno GPU."""
+
+    @tvm.transform.module_pass(opt_level=0)
+    def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext):
+        with target:
+            seq = tvm.transform.Sequential(
+                library_dispatch_passes(target)
+                + legalize_passes(target)
+                + dataflow_lower_passes(target)
+                + finalize_passes(target)
+            )
+            mod = seq(mod)
+        return mod
+
+    return _pipeline
diff --git a/python/tvm/relax/backend/gpu_generic/__init__.py 
b/python/tvm/relax/backend/gpu_generic/__init__.py
index 9c5e65fb49..ea2d2a2afb 100644
--- a/python/tvm/relax/backend/gpu_generic/__init__.py
+++ b/python/tvm/relax/backend/gpu_generic/__init__.py
@@ -19,5 +19,6 @@ from .pipeline import (
     finalize_passes,
     get_default_pipeline,
     legalize_passes,
+    dataflow_lower_passes,
     library_dispatch_passes,
 )
diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py
index ebb61ad3e6..ffb38cdd93 100644
--- a/python/tvm/relax/pipeline.py
+++ b/python/tvm/relax/pipeline.py
@@ -250,6 +250,8 @@ def library_dispatch_passes(target: tvm.target.Target):
         return backend.gpu_generic.library_dispatch_passes(target)
     if target.kind.name == "llvm":
         return backend.cpu_generic.library_dispatch_passes(target)
+    if target.kind.name == "opencl" and "adreno" in target.keys:
+        return backend.adreno.library_dispatch_passes(target)
     # Todo(tvm-team): support gpu-generic
     raise ValueError(f"Target {target} is not yet supported by library 
dispatch passes.")
 
@@ -264,6 +266,8 @@ def legalize_passes(target: tvm.target.Target):
         return backend.gpu_generic.legalize_passes(target)
     if target.kind.name == "llvm":
         return backend.cpu_generic.legalize_passes(target)
+    if target.kind.name == "opencl" and "adreno" in target.keys:
+        return backend.adreno.legalize_passes(target)
     # Todo(tvm-team): support gpu-generic
     raise ValueError(f"Target {target} is not yet supported by library 
dispatch passes.")
 
@@ -278,6 +282,8 @@ def dataflow_lower_passes(target: tvm.target.Target):
         return backend.gpu_generic.dataflow_lower_passes(target)
     if target.kind.name == "llvm":
         return backend.cpu_generic.dataflow_lower_passes(target)
+    if target.kind.name == "opencl" and "adreno" in target.keys:
+        return backend.adreno.dataflow_lower_passes(target)
     # Todo(tvm-team): support gpu-generic
     raise ValueError(f"Target {target} is not yet supported by dataflow 
lowering passes.")
 
@@ -292,6 +298,8 @@ def finalize_passes(target: tvm.target.Target):
         return backend.gpu_generic.finalize_passes(target)
     if target.kind.name == "llvm":
         return backend.cpu_generic.finalize_passes(target)
+    if target.kind.name == "opencl" and "adreno" in target.keys:
+        return backend.adreno.finalize_passes(target)
     # Todo(tvm-team): support gpu-generic
     raise ValueError(f"Target {target} is not yet supported by finalization 
passes.")
 
@@ -306,6 +314,8 @@ def get_default_pipeline(target: tvm.target.Target):
         return backend.gpu_generic.get_default_pipeline(target)
     if target.kind.name == "llvm":
         return backend.cpu_generic.get_default_pipeline(target)
+    if target.kind.name == "opencl" and "adreno" in target.keys:
+        return backend.adreno.get_default_pipeline(target)
     # Todo(tvm-team): support gpu-generic
     raise ValueError(
         f"Target {target} is not yet supported by default pipeline. "
diff --git a/python/tvm/relax/transform/__init__.py 
b/python/tvm/relax/transform/__init__.py
index 16e4800ca3..ffdf31975a 100644
--- a/python/tvm/relax/transform/__init__.py
+++ b/python/tvm/relax/transform/__init__.py
@@ -94,6 +94,7 @@ from .ipc_allreduce_rewrite import IPCAllReduceRewrite
 from .lazy_transform_params import LazyTransformParams
 from .lower_gpu_ipc_alloc_storage import LowerGPUIPCAllocStorage
 from .optimize_layout_transform import OptimizeLayoutTransform
+from .fold_batch_norm_to_conv2d_for_inference import FoldBatchnormToConv2D
 from .remove_redundant_reshape import RemoveRedundantReshape
 
 # Import to register the legalization functions.
diff --git 
a/python/tvm/relax/transform/fold_batch_norm_to_conv2d_for_inference.py 
b/python/tvm/relax/transform/fold_batch_norm_to_conv2d_for_inference.py
new file mode 100644
index 0000000000..9680b540cf
--- /dev/null
+++ b/python/tvm/relax/transform/fold_batch_norm_to_conv2d_for_inference.py
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument, redefined-argument-from-local
+"""Relax Fold Batchnorm into Conv2D."""
+from tvm.ir.module import IRModule
+from tvm.ir.transform import PassContext
+from tvm.relax import Expr
+from tvm.relax.dpl import is_op, rewrite_call, wildcard, is_const, 
TupleGetItemPattern
+from tvm import relax, tir
+
+from . import function_pass
+
+
+@function_pass(opt_level=0)
+class FoldBatchnormToConv2D:
+    """
+    Fuse Batchnorm to its previous Conv2D
+    This optimization is a special case of FoldScaleAxis that folds scale into 
conv2d weights.
+    This pass can be removed when FoldScaleAcis enhances to support this case.
+    """
+
+    def __init__(self):
+        self.input = wildcard()
+        self.weight = is_const()
+        self.pattern_conv2d = is_op("relax.nn.conv2d")(self.input, self.weight)
+        self.bn_weight = is_const()
+        self.bias = is_const()
+        self.mean = is_const()
+        self.variance = is_const()
+        self.pattern_bn = is_op("relax.nn.batch_norm")(
+            self.pattern_conv2d, self.bn_weight, self.bias, self.mean, 
self.variance
+        )
+
+        self.pattern = TupleGetItemPattern(self.pattern_bn, 0)
+
+    def transform_function(self, func: Expr, mod: IRModule, ctx: PassContext) 
-> IRModule:
+        """
+        Tranformation function for pattern Conv2D+BatchNorm+TupleGetItem 
pattern
+        Parameters
+        ----------
+        func: Expr
+            The relax function to be optimized
+        mod: IRModule
+            The ir module
+        ctx: PassContext
+            Relax pass context
+        """
+
+        self.mod = mod
+        updated_call = func
+
+        # Skip primitive functions
+        if "Primitive" in func.attrs.keys() and func.attrs["Primitive"] != 0:
+            return updated_call
+
+        def rewriter(expr, matches):
+            conv_input = matches[self.input]
+            conv_weight = matches[self.weight]
+            bn_weight = matches[self.bn_weight]
+            bn_bias = matches[self.bias]
+            bn_mean = matches[self.mean]
+            bn_variance = matches[self.variance]
+            conv_op = matches[self.pattern_conv2d]
+            bn_op = matches[self.pattern_bn]
+            conv_attrs = conv_op.attrs
+            bn_attrs = bn_op.attrs
+
+            bn_variance = relax.op.add(
+                bn_variance, relax.PrimValue(tir.FloatImm("float32", 
bn_attrs["epsilon"]))
+            )
+            dino = relax.op.sqrt(bn_variance)
+            wt = relax.op.divide(bn_weight, dino)
+            bs = relax.op.subtract(bn_bias, relax.op.multiply(bn_mean, wt))
+            if conv_attrs["kernel_layout"] == "OIHW":
+                wt = relax.op.reshape(wt, 
shape=(bn_weight.struct_info.shape[0], 1, 1, 1))
+            elif conv_attrs["kernel_layout"] == "IOHW":
+                wt = wt.reshape(1, bn_weight.struct_info.shape[0], 1, 1)
+            else:
+                return expr
+            wt_conv = relax.op.multiply(conv_weight, wt)
+            bs_args = relax.op.reshape(bs, shape=(1, 
bn_bias.struct_info.shape[0], 1, 1))
+
+            conv_out = relax.Call(conv_op.op, (conv_input, wt_conv), 
conv_attrs)
+            return relax.op.add(conv_out, bs_args)
+
+        updated_call = rewrite_call(self.pattern, rewriter, func)
+
+        return updated_call
diff --git a/python/tvm/target/__init__.py b/python/tvm/target/__init__.py
index 1bb883e840..9288eb3f97 100644
--- a/python/tvm/target/__init__.py
+++ b/python/tvm/target/__init__.py
@@ -70,6 +70,7 @@ from .target import (
     riscv_cpu,
     hexagon,
     stm32,
+    adreno,
 )
 from .virtual_device import VirtualDevice
 from .compilation_config import make_compilation_config
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 81baa57f9e..d78561eadf 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -830,7 +830,7 @@ def stm32(series="unknown", options=None):
     return Target(" ".join(["c"] + opts))
 
 
-def adreno(model="unknown", options=None):
+def adreno(model="unknown", options=None, clml=False):
     """Returns a Qualcomm GPU target.
     Parameters
     ----------
@@ -839,7 +839,10 @@ def adreno(model="unknown", options=None):
     options : str or list of str
         Additional options
     """
-    opts = ["-device=adreno", "-model=%s" % model]
+    if clml:
+        opts = ["-device=adreno", "--keys=adreno,opencl,gpu,clml", "-model=%s" 
% model]
+    else:
+        opts = ["-device=adreno", "--keys=adreno,opencl,gpu", "-model=%s" % 
model]
     opts = _merge_opts(opts, options)
     return Target(" ".join(["opencl"] + opts))
 
diff --git a/src/relax/backend/contrib/clml/codegen.cc 
b/src/relax/backend/contrib/clml/codegen.cc
new file mode 100644
index 0000000000..8480ca379a
--- /dev/null
+++ b/src/relax/backend/contrib/clml/codegen.cc
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relax/backend/contrib/clml/codegen.cc
+ * \brief Implementation of the OpenCLML JSON serializer.
+ */
+#include <tvm/ir/module.h>
+#include <tvm/ir/transform.h>
+#include <tvm/relax/type.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "../../../transform/utils.h"
+#include "../codegen_json/codegen_json.h"
+#include "../utils.h"
+
+namespace tvm {
+namespace relax {
+namespace contrib {
+
+/*! \brief Attributes to store the compiler options for OpenCLML. */
+struct OpenCLMLCompilerConfigNode : public 
tvm::AttrsNode<OpenCLMLCompilerConfigNode> {
+  Integer clml_version;
+
+  TVM_DECLARE_ATTRS(OpenCLMLCompilerConfigNode, 
"relax.ext.attrs.OpenCLMLCompilerConfigNode") {
+    TVM_ATTR_FIELD(clml_version)
+        .describe("OpenCLML version as (major, minor, patch).")
+        .set_default(Integer(3));
+  }
+};
+
+class OpenCLMLCompilerConfig : public Attrs {
+ public:
+  TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(OpenCLMLCompilerConfig, Attrs,
+                                            OpenCLMLCompilerConfigNode);
+};
+
+TVM_REGISTER_NODE_TYPE(OpenCLMLCompilerConfigNode);
+TVM_REGISTER_PASS_CONFIG_OPTION("relax.ext.clml.options", 
OpenCLMLCompilerConfig);
+
+using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
+using JSONGraphObjectPtr = backend::contrib::JSONGraphObjectPtr;
+using OpAttrExtractor = backend::contrib::OpAttrExtractor;
+using JSONSerializer = backend::contrib::JSONSerializer;
+
+class OpenCLMLJSONSerializer;
+
+/*!
+ * \brief Collect the constants and attributes from all operator calls in the 
body
+ * of a "Composite" function.
+ */
+class CollectCLMLFromCompositeFunctionBody : public ExprVisitor {
+ public:
+  explicit CollectCLMLFromCompositeFunctionBody(OpenCLMLJSONSerializer* 
serializer)
+      : serializer_(serializer), node_(std::make_shared<JSONGraphNode>()) {}
+
+  void VisitExpr_(const ConstantNode* constant_node) final;
+  void VisitExpr_(const CallNode* call_node) final;
+
+  void SetGenericAttributes(const CallNode* call_node) {
+    if (backend::IsOp(call_node, "relax.nn.relu")) {
+      std::vector<std::string> activation_type = {"relu"};
+      std::vector<dmlc::any> act_attr;
+      act_attr.emplace_back(activation_type);
+      node_->SetAttr("activation_type", act_attr);
+    }
+
+    OpAttrExtractor extractor(node_);
+    const Object* attr_obj = call_node->attrs.get();
+    extractor.Extract(const_cast<Object*>(attr_obj));
+  }
+
+  OpenCLMLJSONSerializer* serializer_;
+  /*! \brief Accumulated translated arguments. */
+  std::vector<JSONGraphNodeEntry> args_;
+  /*!
+   * \brief Temporary node into which we'll accumulate attributes. Ideally 
this would be the
+   * final JSONGraphNode however we don't yet know how many inputs that will 
have.
+   */
+  JSONGraphObjectPtr node_;
+};
+
+/*!
+ * \brief Generates an OpenCLMLModule from a relax expression by serializing 
the expression to a
+ * json representation. OpenCLML is not required here because use of OpenCLML 
APIs is deferred until
+ * runtime.
+ */
+class OpenCLMLJSONSerializer : public JSONSerializer {
+ public:
+  explicit OpenCLMLJSONSerializer(Map<Constant, String> constant_names, 
Map<Var, Expr> bindings)
+      : JSONSerializer(constant_names), bindings_(bindings) {}
+
+  /*!
+   * \brief A series of operators that form a composite
+   * convolution. Supports nn.conv2d
+   */
+  struct CompositeConvNode {
+    const CallNode* pad = nullptr;
+    const CallNode* conv = nullptr;
+    const CallNode* bn = nullptr;
+    const CallNode* bias = nullptr;
+    const CallNode* activation = nullptr;
+    std::string act_type;
+  };
+
+  using JSONSerializer::VisitExpr_;
+
+  std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* call_node) final {
+    // The call must be to an inline "Composite" function
+    const auto* fn_var = call_node->op.as<VarNode>();
+    ICHECK(fn_var);
+    const auto fn = Downcast<Function>(bindings_[GetRef<Var>(fn_var)]);
+
+    auto opt_composite = fn->GetAttr<String>(attr::kComposite);
+    ICHECK(opt_composite.defined());
+    std::string name = opt_composite.value();
+
+    std::shared_ptr<JSONGraphNode> node;
+
+    if (backend::EndsWithPattern(name, "nn.conv2d") ||
+        backend::EndsWithPattern(name, "nn.pad_conv2d") ||
+        backend::EndsWithPattern(name, "nn.pad_conv2d_transpose")) {
+      node = CreateCompositeConvJSONNode(call_node);
+    } else {
+      // Collect the constants and attributes of all operator calls inside the 
composite body.
+      CollectCLMLFromCompositeFunctionBody collector(this);
+      collector.VisitExpr(fn->body);
+
+      // Capture the args to the "Composite" function as inputs for this node.
+      std::vector<JSONGraphNodeEntry> inputs;
+      for (const auto& arg : call_node->args) {
+        auto res = VisitExpr(arg);
+        inputs.insert(inputs.end(), res.begin(), res.end());
+      }
+      // Capture constants from the composite function body as additional 
inputs for this node.
+      for (const auto& node : collector.args_) {
+        inputs.emplace_back(node);
+      }
+
+      // Create the final node.
+      node = std::make_shared<JSONGraphNode>(name,
+                                             /*op_type=*/"kernel", inputs,
+                                             /*num_output=*/1);
+
+      // Transfer attributes from the collector's node to the final node.
+      node->CaptureAttrs(*collector.node_);
+
+      // Capture global settings on the JSON node.
+      SaveGlobalAttributes(node);
+
+      VLOG(1) << name << " has " << node->GetInputs().size() << " inputs";
+    }
+
+    return AddNode(node, GetRef<Expr>(call_node));
+  }
+
+  /*!
+   * \brief Extract convolution nodes from a composite function.
+   *
+   * \param cn The call node of the composite function.
+   * \return Extracted composite convolution nodes.
+   */
+  CompositeConvNode UnpackCompositeConvolution(const CallNode* cn) {
+    CompositeConvNode nodes{};
+
+    const auto* fn_var = cn->op.as<VarNode>();
+    ICHECK(fn_var);
+    const auto fn = Downcast<Function>(bindings_[GetRef<Var>(fn_var)]);
+    auto opt_composite = fn->GetAttr<String>(attr::kComposite);
+    ICHECK(opt_composite.defined());
+
+    nodes.pad = backend::TryGetOpInFunction(fn, "relax.nn.pad");
+    nodes.conv = backend::TryGetOpInFunction(fn, "relax.nn.conv2d");
+
+    if (!nodes.conv) {
+      nodes.conv = backend::TryGetOpInFunction(fn, 
"relax.nn.conv2d_transpose");
+    }
+    ICHECK(nodes.conv) << "No Convolution op found in composite function";
+    nodes.bn = backend::TryGetOpInFunction(fn, "relax.nn.batch_norm");
+    nodes.bias = backend::TryGetOpInFunction(fn, "relax.add");
+    nodes.activation = backend::TryGetOpInFunction(fn, "relax.nn.relu");
+    nodes.act_type = "relu";
+    return nodes;
+  }
+
+  /*!
+   * \brief Create a JSON representation of a composite convolution.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode* 
cn) {
+    CompositeConvNode nodes = UnpackCompositeConvolution(cn);
+
+    const auto* fn_var = cn->op.as<VarNode>();
+    ICHECK(fn_var);
+    const auto fn = Downcast<Function>(bindings_[GetRef<Var>(fn_var)]);
+    auto opt_composite = fn->GetAttr<String>(attr::kComposite);
+    ICHECK(opt_composite.defined());
+    std::string name = opt_composite.value();
+
+    std::vector<JSONGraphNodeEntry> inputs;
+
+    inputs.push_back(VisitExpr(cn->args[0])[0]);
+    inputs.push_back(VisitExpr(nodes.conv->args[1])[0]);
+    if (nodes.bias) {
+      inputs.push_back(VisitExpr(nodes.bias->args[1])[0]);
+    }
+    // Deal with Batchnorm Fusing here
+    if (nodes.bn) {
+      inputs.push_back(VisitExpr(nodes.bn->args[1])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[2])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[3])[0]);
+      inputs.push_back(VisitExpr(nodes.bn->args[4])[0]);
+    }
+
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 
1);
+    SetCallNodeAttribute(json_node, nodes.conv);
+
+    if (nodes.bn) {
+      const auto* bn_attr = nodes.bn->attrs.as<BatchNormAttrs>();
+      std::vector<dmlc::any> bn_any_attr;
+      std::vector<std::string> bn_args = {
+          std::to_string(bn_attr->axis), std::to_string(bn_attr->epsilon),
+          std::to_string(bn_attr->center), std::to_string(bn_attr->scale)};
+      bn_any_attr.emplace_back(bn_args);
+      json_node->SetAttr("batchnorm", bn_any_attr);
+    }
+
+    // Override attributes
+    if (nodes.pad) {
+      const auto* pad_attr = nodes.pad->attrs.as<PadAttrs>();
+      ICHECK(pad_attr);
+      auto p = pad_attr->pad_width;
+      // Pad layout for TVM: dimension wise pre and post padding.
+      // CLML takes dimension wise pre-padding followed by dimension wise 
post-padding for W, H.
+      std::vector<std::string> padding = 
{std::to_string(p[4].as<IntImmNode>()->value),
+                                          
std::to_string(p[6].as<IntImmNode>()->value),
+                                          
std::to_string(p[5].as<IntImmNode>()->value),
+                                          
std::to_string(p[7].as<IntImmNode>()->value)};
+      std::vector<dmlc::any> padding_attr;
+      padding_attr.emplace_back(padding);
+      json_node->SetAttr("padding", padding_attr);
+    }
+
+    if (nodes.activation) {
+      std::vector<std::string> activation_type = {nodes.act_type};
+      std::vector<dmlc::any> act_attr;
+      act_attr.emplace_back(activation_type);
+      json_node->SetAttr("activation_type", act_attr);
+    }
+    return json_node;
+  }
+
+  static void SaveGlobalAttributes(std::shared_ptr<JSONGraphNode> node) {
+    auto ctx = transform::PassContext::Current();
+    auto cfg = 
ctx->GetConfig<OpenCLMLCompilerConfig>("relax.ext.clml.options");
+    if (!cfg.defined()) {
+      cfg = AttrsWithDefaultValues<OpenCLMLCompilerConfig>();
+    }
+    std::vector<std::string> clml_version = 
{std::to_string(cfg.value()->clml_version.IntValue())};
+    std::vector<dmlc::any> clml_version_attr;
+    clml_version_attr.emplace_back(clml_version);
+    node->SetAttr("clml_version", clml_version_attr);
+  }
+
+ private:
+  /*! \brief The bindings to look up composite functions. */
+  Map<Var, Expr> bindings_;
+};
+
+void CollectCLMLFromCompositeFunctionBody::VisitExpr_(const ConstantNode* 
constant_node) {
+  for (const auto& entry : 
serializer_->VisitExpr(GetRef<Constant>(constant_node))) {
+    args_.emplace_back(entry);
+  }
+}
+
+void CollectCLMLFromCompositeFunctionBody::VisitExpr_(const CallNode* 
call_node) {
+  SetGenericAttributes(call_node);
+  ExprVisitor::VisitExpr_(call_node);
+}
+
+/*!
+ * \brief Create runtime modules for OpenCLML.
+ * \param functions The extern functions to be compiled via OpenCLML
+ * \return Runtime modules.
+ */
+Array<runtime::Module> OpenCLMLCompiler(Array<Function> functions,
+                                        Map<String, ObjectRef> /*unused*/,
+                                        Map<Constant, String> constant_names) {
+  Array<runtime::Module> compiled_functions;
+  for (const auto& func : functions) {
+    VLOG(1) << "OpenCLML partition:" << std::endl << func;
+    OpenCLMLJSONSerializer serializer(constant_names, AnalyzeVar2Value(func));
+    serializer.serialize(func);
+    std::string graph_json = serializer.GetJSON();
+    auto constant_names = serializer.GetConstantNames();
+    const auto* pf = runtime::Registry::Get("runtime.clml_runtime_create");
+    ICHECK(pf != nullptr) << "Cannot find OpenCLML runtime module create 
function.";
+    std::string func_name = GetExtSymbol(func);
+    VLOG(1) << "Creating clml runtime::Module for '" << func_name << "'";
+    compiled_functions.push_back((*pf)(func_name, graph_json, constant_names));
+  }
+  return compiled_functions;
+}
+
+TVM_REGISTER_GLOBAL("relax.ext.openclml").set_body_typed(OpenCLMLCompiler);
+
+/*!
+ * \brief Check whether OpenCLML graph executor is enabled.
+ * \return True if enabled, False if not.
+ */
+inline constexpr bool IsOpenCLMLRuntimeEnabled() {
+#if TVM_GRAPH_EXECUTOR_CLML
+  return true;
+#else
+  return false;
+#endif  // TVM_GRAPH_EXECUTOR_CLML
+}
+
+/*!
+ * \brief Get OpenCLML version that TVM is built against.
+ * \return The OpenCLML SDK version.
+ */
+Integer GetOpenCLMLVersion() {
+#if TVM_GRAPH_EXECUTOR_CLML
+  return Integer(TVM_CLML_VERSION);
+#else
+  return Integer(3);
+#endif  // TVM_GRAPH_EXECUTOR_CLML
+}
+
+TVM_REGISTER_GLOBAL("relax.is_openclml_runtime_enabled").set_body_typed(IsOpenCLMLRuntimeEnabled);
+TVM_REGISTER_GLOBAL("relax.get_openclml_version").set_body_typed(GetOpenCLMLVersion);
+
+}  // namespace contrib
+}  // namespace relax
+}  // namespace tvm
diff --git a/src/relax/backend/contrib/utils.cc 
b/src/relax/backend/contrib/utils.cc
index b260ea24be..8e214809dd 100644
--- a/src/relax/backend/contrib/utils.cc
+++ b/src/relax/backend/contrib/utils.cc
@@ -64,6 +64,17 @@ Map<String, IntImm> ExtractArgIdx(String pattern_name, 
Function f) {
   return arg_idx;
 }
 
+/*!
+ * \brief Utility function to find the string pattern in string str
+ * \param str the main string to check the pattern
+ * \param pattern the pattern to check in the main string
+ * \return return true if the main string ends with pattern, false otherwise
+ */
+bool EndsWithPattern(const std::string& str, const std::string& pattern) {
+  if (str.length() < pattern.length()) return false;
+  return str.compare(str.length() - pattern.length(), pattern.length(), 
pattern) == 0;
+}
+
 
TVM_REGISTER_GLOBAL("relax.contrib.extract_arg_idx").set_body_typed(ExtractArgIdx);
 
 }  // namespace backend
diff --git a/src/relax/backend/contrib/utils.h 
b/src/relax/backend/contrib/utils.h
index e0195a6195..aa3928ce02 100644
--- a/src/relax/backend/contrib/utils.h
+++ b/src/relax/backend/contrib/utils.h
@@ -111,19 +111,31 @@ inline bool IsOp(const CallNode* call, const std::string& 
op_name) {
  * The function must contain exactly one call to such op.
  * \param f The function to look for an op.
  * \param op_name The name of the op
- * \return A call node which calls an op with the given name
+ * \return A call node which calls an op with the given name or nullptr if not
  */
-inline const CallNode* GetOpInFunction(Function f, const std::string& op_name) 
{
+inline const CallNode* TryGetOpInFunction(Function f, const std::string& 
op_name) {
   auto local_bindings = AnalyzeVar2Value(f);
   for (const auto& entry : local_bindings) {
     if (auto call = entry.second.as<CallNode>(); call && backend::IsOp(call, 
op_name)) {
       return call;
     }
   }
-  LOG(FATAL) << op_name << " not found in the function:\n" << f;
   return nullptr;
 }
 
+/*!
+ * \brief Return a call node within the function which calls an op with the 
given name
+ * The function must contain exactly one call to such op.
+ * \param f The function to look for an op.
+ * \param op_name The name of the op
+ * \return A call node which calls an op with the given name
+ */
+inline const CallNode* GetOpInFunction(Function f, const std::string& op_name) 
{
+  const CallNode* op = TryGetOpInFunction(f, op_name);
+  ICHECK(op) << op_name << " not found in the function:\n" << f;
+  return op;
+}
+
 /*!
  * \brief Extract indices of the argument patterns in the function parameter 
list.
  * Each composite function pattern can register a mapping between variable 
names and the
@@ -149,6 +161,14 @@ std::string to_str(const Type& value) {
   return os.str();
 }
 
+/*!
+ * \brief Utility function to find the string pattern in string str
+ * \param str the main string to check the pattern
+ * \param pattern the pattern to check in the main string
+ * \return return true if the main string ends with pattern, false otherwise
+ */
+bool EndsWithPattern(const std::string& str, const std::string& pattern);
+
 }  // namespace backend
 }  // namespace relax
 }  // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc 
b/src/runtime/contrib/clml/clml_runtime.cc
index fa7338177c..4998f2b476 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -157,7 +157,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       CLML_CALL(clReleaseMLTuningCacheQCOM, this->layer_.tuning_cache);
     }
     for (auto it = this->layer_.storage_map.begin(); it != 
this->layer_.storage_map.end(); it++) {
-      auto tensor_desc = it->second.first;
+      auto tensor_desc = it->second.tensor_desc;
       CLML_CALL(clReleaseMLTensorQCOM, tensor_desc->tensor)
       if (this->layer_.ddr_storage_ref_map.find(tensor_desc->memory) !=
           this->layer_.ddr_storage_ref_map.end()) {
@@ -278,8 +278,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     int op_index = 0;
     for (auto it = this->layer_.storage_map.begin(); it != 
this->layer_.storage_map.end(); it++) {
       int nid = it->first;
-      auto clml_desc = it->second.first;
-      auto node = it->second.second;
+      auto clml_desc = it->second.tensor_desc;
+      auto node = it->second.node;
 
       if ("kernel" == node.GetOpType()) {
         CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index],
@@ -431,6 +431,7 @@ class CLMLRuntime : public JSONRuntimeBase {
    * \return Status of inference.
    */
   void Run() override {
+    LOG_CLML << "Run Start";
     cl_command_queue queue = CLML_QUEUE;
     std::vector<cl_event>& evts = 
cws->workspace->GetEventQueue(cws->tentry->device);
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -453,9 +454,11 @@ class CLMLRuntime : public JSONRuntimeBase {
             evts.resize(evts.size() + 1);
             evt = &(evts.back());
           }
+          LOG_CLML << "Enqueue CLML Copy";
           CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, 
layer_.in_placeholder[nid]->tensor,
                     layer_.in_placeholder[nid]->memory, 
layer_.inputs[nid]->tensor,
                     layer_.inputs[nid]->memory, 0, nullptr, evt);
+          LOG_CLML << "Enqueue CLML Copy Completed";
         } else {
           DLDataType tvm_dtype = 
const_cast<DLTensor*>(data_entry_[eid])->dtype;
           cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
@@ -468,9 +471,11 @@ class CLMLRuntime : public JSONRuntimeBase {
         }
       }
     }
+    LOG_CLML << "Inputs Set";
 
     int64_t duration = 0;
     if (cws->is_recordable_queue) {
+      LOG_CLML << "Execution by Rec Queue";
       if (cws->workspace->IsProfiling(cws->tentry->device)) {
         Timer t;
         auto f = Registry::Get(std::string("profiling.timer.opencl"));
@@ -488,8 +493,10 @@ class CLMLRuntime : public JSONRuntimeBase {
                   0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, 
0, nullptr, nullptr);
       }
     } else {
+      LOG_CLML << "Execution by Normal Queue";
       for (size_t i = 0; i < this->layer_.function.size(); ++i) {
         // Make CLML subgraphs accounted by OpenCLTimerNode.
+        LOG_CLML << "Run Layer:" << this->layer_.layer_names[i];
         if (cws->workspace->IsProfiling(cws->tentry->device)) {
           Timer t;
           auto f = Registry::Get(std::string("profiling.timer.opencl"));
@@ -514,6 +521,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration;
     }
 
+    LOG_CLML << "Run Completed";
     for (size_t i = 0; i < outputs_.size(); ++i) {
       uint32_t eid = EntryID(outputs_[i]);
       void* data = data_entry_[eid]->data;
@@ -548,6 +556,7 @@ class CLMLRuntime : public JSONRuntimeBase {
         free(tmpptr);
       }
     }
+    LOG_CLML << "Run End";
   }
 
  private:
@@ -611,7 +620,12 @@ class CLMLRuntime : public JSONRuntimeBase {
     for (size_t nid = 0; nid < nodes_.size(); ++nid) {
       const auto& node = nodes_[nid];
       uint32_t size = 0;
-      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, 
layer_.storage_map[nid].first->tensor,
+      if (this->layer_.storage_map.find(nid) == 
this->layer_.storage_map.end()) {
+        // Possible that some nodes are not consumed by any operation
+        // Example being nn.pad second argument.
+        continue;
+      }
+      CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, 
layer_.storage_map[nid].tensor_desc->tensor,
                 &size);
 
       if ((node.GetOpType() == "kernel") || (node.GetOpType() == "input")) {
@@ -686,34 +700,57 @@ class CLMLRuntime : public JSONRuntimeBase {
     const JSONGraphNode node = nodes_[nid];
     cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_CNN_QCOM;
 
-    if (this->layer_.storage_map.find(nid) == this->layer_.storage_map.end()) {
-      void* node_data = nullptr;
-      if (node.GetOpType() == "const") {
-        uint32_t eid = EntryID(nid, 0);
-        node_data = data_entry_[eid]->data;
-        usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
+    if (this->layer_.storage_map.find(nid) != this->layer_.storage_map.end()) {
+      if (nullptr != layer_.storage_map[nid].tensor_desc) {
+        return this->layer_.storage_map[nid].tensor_desc;
       }
+    } else {
+      this->layer_.storage_map.insert({nid, NodeDescriptor()});
+      this->layer_.storage_map[nid].node = node;
+    }
 
-      auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, 
dtype, node_data, shape);
-      this->layer_.storage_map.insert({nid, std::make_pair(clml_tensor, 
node)});
+    void* node_data = nullptr;
+    if (node.GetOpType() == "const") {
+      uint32_t eid = EntryID(nid, 0);
+      node_data = data_entry_[eid]->data;
+      usage = CL_TENSOR_USAGE_PARAMETER_QCOM;
+      ICHECK(CL_TENSOR_USAGE_INVALID_QCOM == 
this->layer_.storage_map[nid].usage)
+          << "Parameter have usage reservation !!!";
+    }
+    if (CL_TENSOR_USAGE_INVALID_QCOM != this->layer_.storage_map[nid].usage) {
+      // Respect special reservation on usage.
+      usage = this->layer_.storage_map[nid].usage;
+    } else {
+      this->layer_.storage_map[nid].usage = usage;
+    }
+    if (this->layer_.storage_map[nid].custom_layout) {
+      // Respect special reservation on layout.
+      layout = this->layer_.storage_map[nid].layout;
+    } else {
+      this->layer_.storage_map[nid].layout = layout;
+    }
 
-      if ("input" == node.GetOpType()) {
-        this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].first});
-        // Input copy placeholder Tensor
-        if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
-          this->layer_.in_placeholder.insert(
-              {nid, MakeCLMLTensorFromJSONNode(node, 
CL_TENSOR_LAYOUT_NCHW_QCOM, usage, dtype,
-                                               node_data, shape)});
-        } else {
-          this->layer_.in_placeholder.insert(
-              {nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, 
node_data, shape)});
-        }
+    auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, 
node_data, shape);
+
+    this->layer_.storage_map[nid].tensor_desc = clml_tensor;
+    this->layer_.storage_map[nid].usage = usage;
+    this->layer_.storage_map[nid].layout = layout;
+    LOG_CLML << "Storage Map Alloc:" << nid << " Name:" << node.GetOpName() << 
" Usage: " << usage
+             << " Layout:" << layout;
+
+    if ("input" == node.GetOpType()) {
+      this->layer_.inputs.insert({nid, 
this->layer_.storage_map[nid].tensor_desc});
+      // Input copy placeholder Tensor
+      if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
+        this->layer_.in_placeholder.insert(
+            {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, 
usage, dtype,
+                                             node_data, shape)});
+      } else {
+        this->layer_.in_placeholder.insert(
+            {nid, MakeCLMLTensorFromJSONNode(node, layout, usage, dtype, 
node_data, shape)});
       }
-
-      return clml_tensor;
-    } else {
-      return this->layer_.storage_map[nid].first;
     }
+    return clml_tensor;
   }
 
   /*!
@@ -730,7 +767,8 @@ class CLMLRuntime : public JSONRuntimeBase {
       const auto& node = nodes_[nid];
       if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, 
node, nid);
       if ("nn.batch_matmul" == node.GetOpName()) 
CreateBatchMatmulLayerTensor(&layer_, node, nid);
-      if ("nn.softmax" == node.GetOpName()) CreateSoftmaxLayerTensor(&layer_, 
node, nid);
+      if ("nn.softmax" == node.GetOpName() || PatternMatch(node.GetOpName(), 
"nn.softmax"))
+        CreateSoftmaxLayerTensor(&layer_, node, nid);
     }
 
     for (nid = 0; nid < nodes_.size(); ++nid) {
@@ -739,30 +777,33 @@ class CLMLRuntime : public JSONRuntimeBase {
         // Layers may request for different layout. Differ the input 
allocation.
       } else if (node.GetOpType() == "kernel") {
         auto op_name = node.GetOpName();
-        if ("nn.conv2d" == op_name)
+        if (PatternMatch(op_name, "nn.conv2d") || PatternMatch(op_name, 
"nn.pad_conv2d"))
           CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_CONVOLUTION_QCOM, nid);
-        else if ("nn.depthwise_conv2d" == op_name)
+        else if (PatternMatch(op_name, "nn.depthwise_conv2d"))
           CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_DEPTHWISE_QCOM, nid);
-        else if ("nn.conv2d_transpose" == op_name)
+        else if (PatternMatch(op_name, "nn.conv2d_transpose"))
           CreateConvolution2DLayer(&layer_, node, 
CL_CONVOLUTION_MODE_TRANSPOSE_QCOM, nid);
-        else if ("nn.relu6" == op_name)
+        else if ("nn.relu6" == op_name || PatternMatch(op_name, "nn.relu6"))
           CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU6);
-        else if ("nn.relu" == op_name)
+        else if (PatternMatch(op_name, "nn.relu"))
           CreateReLULayer(&layer_, node, nid, CL_ACTIVATION_RELU);
-        else if ("nn.batch_norm" == op_name)
+        else if (PatternMatch(op_name, "nn.batch_norm"))
           CreateBatchNormLayer(&layer_, node, nid);
         else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
-                 "nn.l2_pool2d" == op_name)
+                 "nn.l2_pool2d" == op_name || PatternMatch(op_name, 
"nn.max_pool2d") ||
+                 PatternMatch(op_name, "nn.avg_pool2d"))
           CreatePoolingLayer(&layer_, node, nid);
-        else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" 
== op_name)
+        else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" 
== op_name ||
+                 PatternMatch(op_name, "nn.global_avg_pool2d") ||
+                 PatternMatch(op_name, "nn.global_max_pool2d"))
           CreateGlobalPoolingLayer(&layer_, node, nid);
-        else if ("reshape" == op_name)
+        else if ("reshape" == op_name || PatternMatch(op_name, "reshape"))
           CreateReshapeLayer(&layer_, node, nid);
         else if ("concatenate" == op_name)
           CreateConcatLayer(&layer_, node, nid);
         else if ("nn.dense" == op_name)
           CreateDenseLayer(&layer_, node, nid);
-        else if ("nn.softmax" == op_name)
+        else if ("nn.softmax" == op_name || PatternMatch(op_name, 
"nn.softmax"))
           CreateSoftMaxLayer(&layer_, node, nid);
         else if ("nn.pad" == op_name)
           CreatePadLayer(&layer_, node, nid);
@@ -771,7 +812,11 @@ class CLMLRuntime : public JSONRuntimeBase {
         else if ("clip" == op_name)
           CreateClipLayer(&layer_, node, nid);
         else if ("add" == op_name || "subtract" == op_name || "multiply" == 
op_name ||
-                 "minimum" == op_name || "maximum" == op_name || "divide" == 
op_name)
+                 "minimum" == op_name || "maximum" == op_name || "divide" == 
op_name ||
+                 PatternMatch(op_name, "relax.add") || PatternMatch(op_name, 
"relax.subtract") ||
+                 PatternMatch(op_name, "relax.multiply") ||
+                 PatternMatch(op_name, "relax.minimum") || 
PatternMatch(op_name, "relax.maximum") ||
+                 PatternMatch(op_name, "relax.divide"))
           CreateBinaryLayer(&layer_, node, nid);
         else if ("nn.depth_to_space" == op_name)
           CreateDepthToSpaceLayer(&layer_, node, nid);
@@ -793,7 +838,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       nid = outputs_[i].id_;
       DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
       cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
-      this->layer_.outputs.push_back(this->layer_.storage_map[nid].first);
+      
this->layer_.outputs.push_back(this->layer_.storage_map[nid].tensor_desc);
       if (this->layer_.out_shapes.find(nid) != this->layer_.out_shapes.end()) {
         // Handle customized shapes here
         this->layer_.out_placeholder.push_back(MakeCLMLTensorFromJSONNode(
@@ -814,12 +859,12 @@ class CLMLRuntime : public JSONRuntimeBase {
     size_t alloc_ddr = 0;
     size_t alloc_ddr_reuse = 0;
     for (auto it = this->layer_.storage_map.begin(); it != 
this->layer_.storage_map.end(); it++) {
-      auto tensor_desc = it->second.first;
+      auto tensor_desc = it->second.tensor_desc;
       uint32_t mem_size = 0;
       result = CL_OUT_OF_HOST_MEMORY;
       CLML_CALL(clGetMLTensorMemorySizeQCOM, CLML_CTX, tensor_desc->tensor, 
&mem_size);
 
-      JSONGraphNode node = it->second.second;
+      JSONGraphNode node = it->second.node;
       void* node_data = nullptr;
       size_t on_chip_mem_offset = -1;
       if (layer_.on_chip_alloc_plan.find(it->first) != 
layer_.on_chip_alloc_plan.end()) {
@@ -939,6 +984,7 @@ class CLMLRuntime : public JSONRuntimeBase {
     std::vector<std::string> strides = 
node.GetAttr<std::vector<std::string>>("strides");
     std::vector<std::string> dilation = 
node.GetAttr<std::vector<std::string>>("dilation");
     std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
     cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, 
cl_dtype);
@@ -946,6 +992,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       clml_padding.resize(4);
       std::fill(clml_padding.begin(), clml_padding.end(), 0);
     }
+
     cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = 
{clml_padding[0], clml_padding[1]};
     cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = 
{clml_padding[2], clml_padding[3]};
     std::vector<cl_uint> v_strides = GetVectorValues(strides);
@@ -982,8 +1029,8 @@ class CLMLRuntime : public JSONRuntimeBase {
     size_t num_inputs = inputs.size();
     bool has_bias;
     bool has_bn;
-    ICHECK(num_inputs >= 2U && num_inputs <= 7U)
-        << "Batchnorm fused convolution requires bax 7 arguments";
+    ICHECK(num_inputs >= 2 && num_inputs <= 7)
+        << "Batchnorm fused convolution requires max 7 arguments";
     has_bias = (num_inputs == 3) || (num_inputs == 7);
     has_bn = (num_inputs == 6) || (num_inputs == 7);
     // Input
@@ -1032,6 +1079,12 @@ class CLMLRuntime : public JSONRuntimeBase {
       int bn_index = has_bias ? 3 : 2;
       int axis = 
std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
       auto bn_dims = GetTensorDims(nodes_[inputs[bn_index].id_]);
+      float epsilon = 
std::stof(node.GetAttr<std::vector<std::string>>("batchnorm")[1]);
+
+      std::vector<cl_ml_op_properties_qcom> opProperties;
+      opProperties.push_back(CL_ML_BATCH_NORM_OP_EPSILON_QCOM);
+      
opProperties.push_back(*reinterpret_cast<cl_ml_op_properties_qcom*>(&epsilon));
+      opProperties.push_back(CL_ML_OP_PROPERTY_LIST_END_QCOM);
       std::vector<size_t> bn_shape = {1, 1, 1, 1};
       bn_shape[axis] = bn_dims.n;
       auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
@@ -1049,15 +1102,15 @@ class CLMLRuntime : public JSONRuntimeBase {
 
       cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, 
cl_arithmetic_mode};
       if (!has_act) {
-        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, 
nullptr, &conv_desc,
-                  &bn_desc, input->tensor, weight->tensor, bias->tensor, 
output->tensor,
+        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormForwardQCOM, CLML_CTX, 
opProperties.data(),
+                  &conv_desc, &bn_desc, input->tensor, weight->tensor, 
bias->tensor, output->tensor,
                   bn_mean->tensor, bn_var->tensor, bn_scale->tensor, 
bn_bias->tensor, &op,
                   layer_.tuning_cache);
       } else {
-        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, 
CLML_CTX, nullptr,
-                  &conv_desc, &bn_desc, &act_desc, input->tensor, 
weight->tensor, bias->tensor,
-                  output->tensor, nullptr, bn_mean->tensor, bn_var->tensor, 
bn_scale->tensor,
-                  bn_bias->tensor, &op, layer_.tuning_cache);
+        CLML_CALL(clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM, 
CLML_CTX,
+                  opProperties.data(), &conv_desc, &bn_desc, &act_desc, 
input->tensor,
+                  weight->tensor, bias->tensor, output->tensor, nullptr, 
bn_mean->tensor,
+                  bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, 
layer_.tuning_cache);
       }
       layer->function.push_back(op);
     }
@@ -1176,8 +1229,9 @@ class CLMLRuntime : public JSONRuntimeBase {
     std::vector<cl_uint> clml_padding = GetVectorValues(padding);
 
     cl_ml_op_pooling_desc_qcom pool_desc = {
-        node.GetOpName() == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
-                                            : 
CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+        ((node.GetOpName() == "nn.max_pool2d") || 
PatternMatch(node.GetOpName(), "nn.max_pool2d"))
+            ? CL_POOLING_MODE_MAX_QCOM
+            : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
         4,  // reserved
         {clml_padding[0], clml_padding[1]},
         {clml_padding[2], clml_padding[3]},
@@ -1221,8 +1275,10 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     auto in_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
     cl_ml_op_pooling_desc_qcom pool_desc = {
-        node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
-                                                   : 
CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+        ((node.GetOpName() == "nn.global_max_pool2d") ||
+         PatternMatch(node.GetOpName(), "nn.global_max_pool2d"))
+            ? CL_POOLING_MODE_MAX_QCOM
+            : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
         4,  // reserved
         {0, 0},
         {0, 0},
@@ -1252,7 +1308,6 @@ class CLMLRuntime : public JSONRuntimeBase {
    * \param node The JSON representation of the operator.
    * \param nid The node index of JSON graph node, which points to this 
operator.
    */
-
   void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, 
size_t nid) {
     cl_ml_tensor_layout_qcom layout;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
@@ -1664,19 +1719,23 @@ class CLMLRuntime : public JSONRuntimeBase {
     auto output = MakeCLMLTensorFromJSONEntry(nid, {}, 
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
     std::string op_name = node.GetOpName();
     cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
-    if (op_name == "subtract")
+    if (op_name == "subtract" || PatternMatch(op_name, "relax.subtract"))
       binary_op = CL_TENSOR_OP_SUB_QCOM;
-    else if (op_name == "multiply")
+    else if (op_name == "multiply" || PatternMatch(op_name, "relax.multiply"))
       binary_op = CL_TENSOR_OP_MUL_QCOM;
-    else if (op_name == "divide")
+    else if (op_name == "divide" || PatternMatch(op_name, "relax.divide"))
       binary_op = CL_TENSOR_OP_DIV_QCOM;
-    else if (op_name == "minimum")
+    else if (op_name == "minimum" || PatternMatch(op_name, "relax.minimum"))
       binary_op = CL_TENSOR_OP_MIN_QCOM;
-    else if (op_name == "maximum")
+    else if (op_name == "maximum" || PatternMatch(op_name, "relax.maximum"))
       binary_op = CL_TENSOR_OP_MAX_QCOM;
+    else if (op_name == "add" || PatternMatch(op_name, "relax.add"))
+      binary_op = CL_TENSOR_OP_ADD_QCOM;
+    else
+      LOG(FATAL) << "Undefined binary op:" << op_name;
     cl_ml_op_binary_desc_qcom add_desc = {
         binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, 
cl_arithmetic_mode};
-
+    LOG(INFO) << "Op name - " << op_name;
     CLML_CALL(clCreateMLOpBinaryQCOM, CLML_CTX, nullptr, &add_desc, 
input_a->tensor,
               input_b->tensor, output->tensor, &op, layer_.tuning_cache);
     ICHECK(op) << op_name << " Node Error";
diff --git a/src/runtime/contrib/clml/clml_runtime.h 
b/src/runtime/contrib/clml/clml_runtime.h
index 9dfde2f782..faada2ddee 100644
--- a/src/runtime/contrib/clml/clml_runtime.h
+++ b/src/runtime/contrib/clml/clml_runtime.h
@@ -230,6 +230,18 @@ class CLMLThreadEntry {
   static CLMLThreadEntry* ThreadLocal();
 };
 
+/*!
+ * \brief Node descriptor to hold various information related to a Node.
+ */
+struct NodeDescriptor {
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor_desc = nullptr;
+  JSONGraphNode node;
+  // Check the flag and them pick the layout.
+  bool custom_layout = false;
+  cl_ml_tensor_layout_qcom layout;
+  cl_ml_tensor_usage_qcom usage = CL_TENSOR_USAGE_INVALID_QCOM;
+};
+
 /*!
  * \brief CLML objects we cache in order to avoid needing to construct
  * a new layer each time.
@@ -249,9 +261,8 @@ struct CachedLayer {
   std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> out_placeholder;
   /* Tensor shape exception list while returning from CLML Subgraph */
   std::map<int, std::vector<size_t>> out_shapes;
-  /* Map of all tensors which need backing memory allocation */
-  std::map<int, std::pair<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>, 
JSONGraphNode>>
-      storage_map;
+  /* Map of nodeid and descriptors */
+  std::map<int, struct NodeDescriptor> storage_map;
   /* Tensor memory descriptors list to set after backing memory allocation */
   std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;
   cl_ml_tensor_mem_desc_set_qcom descriptorSet;
diff --git a/src/runtime/contrib/clml/clml_utils.cc 
b/src/runtime/contrib/clml/clml_utils.cc
index 354bd104b8..557815dfa1 100644
--- a/src/runtime/contrib/clml/clml_utils.cc
+++ b/src/runtime/contrib/clml/clml_utils.cc
@@ -240,6 +240,17 @@ std::vector<cl_uint> GetVectorValues(const 
std::vector<std::string>& val) {
   return array;
 }
 
+/*!
+ * \brief Utility function to find the string pattern in string str
+ * \param str the main string to check the pattern
+ * \param pattern the pattern to check in the main string
+ * \return return true if the main string ends with pattern, false otherwise
+ */
+bool PatternMatch(const std::string& str, const std::string& pattern) {
+  if (str.length() < pattern.length()) return false;
+  return str.compare(str.length() - pattern.length(), pattern.length(), 
pattern) == 0;
+}
+
 }  //  namespace contrib
 }  //  namespace runtime
 }  //  namespace tvm
diff --git a/src/runtime/contrib/clml/clml_utils.h 
b/src/runtime/contrib/clml/clml_utils.h
index 2051793cf1..0496878840 100644
--- a/src/runtime/contrib/clml/clml_utils.h
+++ b/src/runtime/contrib/clml/clml_utils.h
@@ -68,6 +68,8 @@ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> 
MakeCLMLTensorFromJSONNode(
 
 std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& val);
 
+bool PatternMatch(const std::string& str, const std::string& pattern);
+
 }  // namespace contrib
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/scripts/unity/task_python_relax.sh 
b/tests/python/relax/backend/clml/conftest.py
old mode 100755
new mode 100644
similarity index 52%
copy from tests/scripts/unity/task_python_relax.sh
copy to tests/python/relax/backend/clml/conftest.py
index 688812b35d..00bad5da21
--- a/tests/scripts/unity/task_python_relax.sh
+++ b/tests/python/relax/backend/clml/conftest.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env bash
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -16,26 +15,25 @@
 # specific language governing permissions and limitations
 # under the License.
 
-set -euxo pipefail
+import os
+import sys
+import tvm
+import pytest
+from tvm import rpc as _rpc
 
-source tests/scripts/setup-pytest-env.sh
-export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}"
 
-# to avoid CI CPU thread throttling.
-export TVM_BIND_THREADS=0
-export TVM_NUM_THREADS=2
-
-# setup cython
-cd python; python3 setup.py build_ext --inplace; cd ..
-
-# Run Relax tests
-TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm}" pytest tests/python/relax
-TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm}" pytest tests/python/dlight
-
-# Run Relax examples
-# python3 ./apps/relax_examples/mlp.py
-# python3 ./apps/relax_examples/nn_module.py
-# python3 ./apps/relax_examples/resnet.py
-
-# Test for MSC
-# pytest tests/python/contrib/test_msc
[email protected](scope="session")
+def rpc():
+    rpc_target = os.getenv("RPC_TARGET", None)
+    if rpc_target:
+        connection_type = "tracker"
+        host = os.getenv("TVM_TRACKER_HOST", "localhost")
+        port = int(os.getenv("TVM_TRACKER_PORT", 9090))
+        target = "opencl"
+        target_host = "llvm -mtriple=aarch64-linux-gnu"
+        device_key = os.getenv("RPC_DEVICE_KEY", "android")
+        cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++")
+        tracker = _rpc.connect_tracker(host, port)
+        return tracker.request(device_key, priority=1, session_timeout=1000)
+    else:
+        return None
diff --git a/tests/python/relax/backend/clml/mod_utils.py 
b/tests/python/relax/backend/clml/mod_utils.py
new file mode 100644
index 0000000000..1efbf40c5c
--- /dev/null
+++ b/tests/python/relax/backend/clml/mod_utils.py
@@ -0,0 +1,728 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML integration operator tests."""
+import pytest
+import numpy as np
+import tvm
+import tvm.testing
+import json
+
+from tvm import relax, rpc
+from tvm.script import relax as R
+from tvm.script import ir as I
+from tvm.script import tir as T
+from tvm.script.ir_builder import IRBuilder
+from tvm.script.ir_builder import relax as relax_builder
+from tvm.relax.backend.adreno import clml
+
+
+def get_relax_conv2d_mod(
+    data_shape,
+    weight_shape,
+    stride,
+    dilation,
+    padding,
+    weight_layout="OIHW",
+    groups=1,
+    dtype="float32",
+    has_bias=False,
+    has_bn=False,
+    has_activation=False,
+    has_pad=False,
+    is_depthwise=False,
+):
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            if has_pad:
+                p = (0, 0, 0, 0, padding[0], padding[0], padding[1], 
padding[1])
+                orig_data = R.arg("data", R.Tensor(data_shape, dtype))
+                data = R.nn.pad(orig_data, pad_width=p, pad_value=0.0)
+                padding = (0, 0, 0, 0)
+            else:
+                data = R.arg("data", R.Tensor(data_shape, dtype))
+            weight = R.arg("weight", R.Tensor(weight_shape, dtype))
+            if has_bias:
+                bias = R.arg("bias", R.Tensor((1, weight_shape[0], 1, 1), 
dtype))
+
+            is_depthwise = data_shape[1] == weight_shape[0] == groups
+
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.conv2d(
+                        data,
+                        weight,
+                        out_dtype=dtype,
+                        strides=stride,
+                        dilation=dilation,
+                        padding=padding,
+                        data_layout="NCHW",
+                        kernel_layout=weight_layout,
+                        groups=groups,
+                    )
+                )
+                if has_bias:
+                    output = R.emit(output + bias)
+                if has_bn:
+                    gamma = R.arg("gamma", R.Tensor((weight_shape[0],), dtype))
+                    beta = R.arg("beta", R.Tensor((weight_shape[0],), dtype))
+                    mean = R.arg("mean", R.Tensor((weight_shape[0],), dtype))
+                    variance = R.arg("variance", R.Tensor((weight_shape[0],), 
dtype))
+                    output = R.emit(
+                        R.nn.batch_norm(output, gamma, beta, mean, variance, 
axis=1, epsilon=1e-5)[
+                            0
+                        ]
+                    )
+                if has_activation:
+                    output = R.emit(R.nn.relu(output))
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_clml_conv2d_codegen(
+    data_shape,
+    weight_shape,
+    stride,
+    dilation,
+    padding,
+    weight_layout="OIHW",
+    groups=1,
+    dtype="float32",
+    has_bias=False,
+    has_bn=False,
+    has_activation=False,
+    has_pad=False,
+    is_depthwise=False,
+):
+    kernel_h, kernel_w = weight_shape[2], weight_shape[3]
+    channels = weight_shape[0]
+    if len(padding) == 2:
+        padding = (padding[0], padding[1], padding[0], padding[1])
+    output_height = ((data_shape[2] - kernel_h + padding[0] + padding[2]) / 
stride[0]) + 1
+    output_width = ((data_shape[3] - kernel_w + padding[1] + padding[3]) / 
stride[1]) + 1
+    output_shape = (1, channels, int(output_height), int(output_width))
+    out_dtype = dtype
+    is_depthwise = data_shape[1] == channels == groups
+
+    weight_layout = "IOHW" if is_depthwise else "OIHW"
+    if weight_layout == "OIHW":
+        weight_shape = (channels, data_shape[1] // groups, kernel_h, kernel_w)
+    else:
+        weight_shape = (data_shape[1] // groups, channels, kernel_h, kernel_w)
+
+    if is_depthwise:
+        name = "openclml.nn.depthwise_conv2d"
+    else:
+        name = "openclml.nn.conv2d"
+
+    node = {
+        "op": "kernel",
+        "name": "",
+        "inputs": [],
+        "attrs": {
+            "groups": [[str(groups)]],
+            "num_outputs": "1",
+            "data_layout": [["NCHW"]],
+            "kernel_layout": [[weight_layout]],
+            "dilation": [[str(dilation[0]), str(dilation[1])]],
+            "out_layout": [["NCHW"]],
+            "out_dtype": [[out_dtype]],
+            "shape": [[list(output_shape)]],
+            "dtype": [[dtype]],
+            "padding": [[str(p) for p in padding]],
+            "strides": [[str(s) for s in stride]],
+        },
+    }
+
+    if has_activation:
+        node["attrs"]["activation_type"] = [["relu"]]
+
+    nodes = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(data_shape)]], "dtype": [[str(dtype)]]},
+        },
+    ]
+
+    nodes.append(
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {"shape": [[list(weight_shape)]], "dtype": 
[[str(dtype)]]},
+        }
+    )
+
+    if has_bias:
+        bias_dtype = dtype
+        nodes.append(
+            {
+                "op": "const",
+                "name": "",
+                "attrs": {
+                    "shape": [[[1, weight_shape[1] if is_depthwise else 
weight_shape[0], 1, 1]]],
+                    "dtype": [[bias_dtype]],
+                },
+            }
+        )
+
+    if has_bn:
+        bn_shape = [[1, weight_shape[0], 1, 1]]
+        # conv2d + bn --> conv2d + Add due to OptimizeBatchNorm transformation 
Pass
+        nodes.append(
+            {
+                "name": "",
+                "op": "const",
+                "attrs": {"dtype": [[dtype]], "shape": [[[1, weight_shape[0], 
1, 1]]]},
+            },
+        )
+
+    input_idx = 0
+    for _ in range(len(nodes)):
+        node["inputs"].append([input_idx, 0, 0])
+        input_idx += 1
+    node["attrs"]["num_inputs"] = str(len(nodes))
+    nodes.append(node)
+    return nodes
+
+
+def get_relax_conv2d_transpose_mod(
+    data_shape,
+    weight_shape,
+    channels,
+    stride,
+    padding,
+    dtype="float32",
+):
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor(data_shape, dtype))
+            weight = R.arg("weight", R.Tensor(weight_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.conv2d_transpose(
+                        data,
+                        weight,
+                        groups=1,
+                        strides=stride,
+                        padding=padding,
+                        kernel_layout="OIHW",
+                        data_layout="NCHW",
+                    )
+                )
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_conv2d_transpose_expected_codegen(
+    dshape, kshape, channels, kernel_size, strides, padding, dilation, dtype, 
output_shape
+):
+    attrs = {
+        "data_layout": [["NCHW"]],
+        "kernel_layout": [["OIHW"]],
+        "groups": [["1"]],
+        "clml_version": [["3"]],
+        "dilation": [[str(p) for p in dilation]],
+        "num_inputs": "2",
+        "num_outputs": "1",
+        "padding": [[str(p) for p in padding]],
+        "shape": [[list(output_shape)]],
+        "dtype": [[dtype]],
+        "strides": [[str(s) for s in strides]],
+        "out_dtype": [[""]],
+        "out_layout": [["NCHW"]],
+        "output_padding": [["0", "0"]],
+    }
+
+    exp_codegen = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(dshape)]], "dtype": [[str(dtype)]]},
+        },
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {"shape": [[list(kshape)]], "dtype": [[str(dtype)]]},
+        },
+        {
+            "op": "kernel",
+            "name": "",
+            "inputs": [[0, 0, 0], [1, 0, 0]],
+            "attrs": attrs,
+        },
+    ]
+    return exp_codegen
+
+
+def get_batchnorm_mod(data_shape, channels, axis, epsilon, dtype):
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor(data_shape, dtype))
+            gamma = R.arg("gamma", R.Tensor((channels,), dtype))
+            beta = R.arg("beta", R.Tensor((channels,), dtype))
+            mean = R.arg("moving_mean", R.Tensor((channels,), dtype))
+            variance = R.arg("moving_var", R.Tensor((channels,), dtype))
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.batch_norm(data, gamma, beta, mean, variance, axis, 
epsilon)[0]
+                )
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+        func = builder.get()
+        return tvm.IRModule({"main": func})
+
+
+def get_binary_op_mod(a_shape, b_shape, op, dtype):
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            a = R.arg("a", R.Tensor(a_shape, dtype))
+            b = R.arg("b", R.Tensor(b_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(op(a, b))
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+
+    low, high = 0, 1
+    a_data = np.random.uniform(low, high, size=(a_shape)).astype(dtype)
+    b_data = np.random.uniform(low, high, size=(b_shape)).astype(dtype)
+
+    return (tvm.IRModule({"main": func}), (a_data, b_data))
+
+
+def get_unary_op_mod(a_shape, op, dtype):
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            a = R.arg("a", R.Tensor(a_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(op(a))
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+
+    low, high = 0, 1
+    a_data = np.random.uniform(low, high, size=(a_shape)).astype(dtype)
+
+    return (tvm.IRModule({"main": func}), (a_data,))
+
+
+def get_relax_maxpool_mod(
+    data_shape, dtype, pool_size, stride=None, dilation=(1, 1), padding=(0, 
0), has_pad=False
+):
+    """
+    Args:
+        data_shape (tuple): Input tensor shape
+        pool_size (tuple): Pooling window size (height, width)
+        stride (tuple, optional): Stride of pooling operation. Defaults to 
pool_size.
+        dilation (tuple, optional): Dilation rate. Defaults to (1, 1).
+        padding (tuple, optional): Padding for the input tensor. Defaults to 
(0, 0).
+        dtype (str, optional): Data type. Defaults to "float32".
+        has_pad (bool, optional): Whether to apply explicit padding. Defaults 
to False.
+
+    Returns:
+        tvm.IRModule: Relax MaxPool module
+    """
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+
+            if has_pad:
+                p = (0, 0, 0, 0, padding[0], padding[1], padding[0], 
padding[1])
+                orig_data = R.arg("data", R.Tensor(data_shape, dtype))
+                data = R.nn.pad(orig_data, pad_width=p, 
pad_value=float("-inf"))
+                padding = (0, 0)
+            else:
+                data = R.arg("data", R.Tensor(data_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.max_pool2d(
+                        data,
+                        pool_size=pool_size,
+                        strides=stride,
+                        dilation=dilation,
+                        padding=padding,
+                        layout="NCHW",
+                    )
+                )
+                R.output(output)
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_maxpool_expected_codegen(input_shape, pool_size, stride, padding, 
pool_type, dtype):
+    import math
+
+    adjusted_input_shape = [
+        input_shape[0],
+        input_shape[1],
+        input_shape[2] + padding[0] + padding[1],
+        input_shape[3] + padding[2] + padding[3],
+    ]
+
+    pool_height = math.floor(((adjusted_input_shape[2] - pool_size[0]) / 
stride[0]) + 1)
+    pool_width = math.floor(((adjusted_input_shape[3] - pool_size[1]) / 
stride[1]) + 1)
+    output_shape = [adjusted_input_shape[0], adjusted_input_shape[1], 
pool_height, pool_width]
+
+    attrs = {
+        "ceil_mode": [["0"]],
+        "clml_version": [["3"]],
+        "dilation": [["1", "1"]],
+        "layout": [["NCHW"]],
+        "num_inputs": "1",
+        "num_outputs": "1",
+        "out_layout": [["NCHW"]],
+        "padding": [[str(0) for p in padding]],
+        "pool_size": [[str(p) for p in pool_size]],
+        "shape": [[list(output_shape)]],
+        "dtype": [[dtype]],
+        "strides": [[str(s) for s in stride]],
+        "count_include_pad": [["0"]],
+    }
+    if sum(padding):
+        attrs["count_include_pad"] = [["0"]]
+
+    exp_codegen = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(adjusted_input_shape)]], "dtype": 
[[str(dtype)]]},
+        },
+        {
+            "op": "kernel",
+            "name": "",
+            "inputs": [[0, 0, 0]],
+            "attrs": attrs,
+        },
+    ]
+    return exp_codegen
+
+
+def get_relax_avgpool_mod(data_shape, dtype, pool_size, stride, dilation, 
padding, has_pad):
+    """
+    Args:
+        data_shape (tuple): Input tensor shape
+        pool_size (tuple): Pooling window size (height, width)
+        stride (tuple, optional): Stride of pooling operation. Defaults to 
pool_size.
+        dilation (tuple, optional): Dilation rate. Defaults to (1, 1).
+        padding (tuple, optional): Padding for the input tensor. Defaults to 
(0, 0).
+        dtype (str, optional): Data type. Defaults to "float32".
+        has_pad (bool, optional): Whether to apply explicit padding. Defaults 
to False.
+        count_include_pad (bool, optional): Whether to include padding in 
averaging. Defaults to True.
+
+    Returns:
+        tvm.IRModule: Relax AvgPool module
+    """
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+
+            if has_pad:
+                p = (0, 0, 0, 0, padding[0], padding[1], padding[0], 
padding[1])
+                orig_data = R.arg("data", R.Tensor(data_shape, dtype))
+                data = R.nn.pad(orig_data, pad_width=p, pad_value=0.0)
+                padding = (0, 0)
+            else:
+                data = R.arg("data", R.Tensor(data_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.avg_pool2d(
+                        data,
+                        pool_size=pool_size,
+                        strides=stride,
+                        dilation=dilation,
+                        padding=padding,
+                        layout="NCHW",
+                    )
+                )
+                R.output(output)
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_avgpool_expected_codegen(input_shape, pool_size, stride, padding, 
pool_type, dtype):
+    import math
+
+    adjusted_input_shape = [
+        input_shape[0],
+        input_shape[1],
+        input_shape[2] + padding[0] + padding[1],
+        input_shape[3] + padding[2] + padding[3],
+    ]
+
+    pool_height = math.floor(((adjusted_input_shape[2] - pool_size[0]) / 
stride[0]) + 1)
+    pool_width = math.floor(((adjusted_input_shape[3] - pool_size[1]) / 
stride[1]) + 1)
+    output_shape = [adjusted_input_shape[0], adjusted_input_shape[1], 
pool_height, pool_width]
+
+    attrs = {
+        "ceil_mode": [["0"]],
+        "clml_version": [["3"]],
+        "dilation": [["1", "1"]],
+        "layout": [["NCHW"]],
+        "num_inputs": "1",
+        "num_outputs": "1",
+        "out_layout": [["NCHW"]],
+        "padding": [[str(0) for p in padding]],
+        "pool_size": [[str(p) for p in pool_size]],
+        "shape": [[list(output_shape)]],
+        "dtype": [[dtype]],
+        "strides": [[str(s) for s in stride]],
+        "count_include_pad": [["0"]],
+    }
+    if sum(padding):
+        attrs["count_include_pad"] = [["0"]]
+
+    exp_codegen = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(adjusted_input_shape)]], "dtype": 
[[str(dtype)]]},
+        },
+        {
+            "op": "kernel",
+            "name": "",
+            "inputs": [[0, 0, 0]],
+            "attrs": attrs,
+        },
+    ]
+    return exp_codegen
+
+
+def get_relax_reshape_mod(input_shape, output_shape, dtype):
+    """
+    Args:
+        input_shape (tuple): Input tensor shape
+        output_shape (tuple): Desired output tensor shape
+        dtype (str, optional): Data type. Defaults to "float32".
+
+    Returns:
+        tvm.IRModule: Relax Reshape module
+    """
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor(input_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(R.reshape(data, output_shape))
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_relax_reshape_codegen(input_shape, output_shape, dtype):
+    def compute_output_shape(input_shape, output_shape):
+        input_elements = np.prod(input_shape)
+        specified_elements = np.prod([dim for dim in output_shape if dim != 
-1])
+        missing_dim = input_elements // specified_elements
+        return [int(dim) if dim != -1 else int(missing_dim) for dim in 
output_shape]
+
+    expected_output_shape = compute_output_shape(input_shape, output_shape)
+
+    expected_codegen_str = [
+        {
+            "attrs": {
+                "dtype": [[dtype]],
+                "shape": [[list(input_shape)]],
+            },
+            "name": "",
+            "op": "input",
+        },
+        {
+            "attrs": {
+                "clml_version": [["3"]],
+                "dtype": [[dtype]],
+                "num_inputs": "1",
+                "num_outputs": "1",
+                "shape": [[expected_output_shape]],
+            },
+            "inputs": [[0, 0, 0]],
+            "name": "",
+            "op": "kernel",
+        },
+    ]
+    return expected_codegen_str
+
+
+def get_relax_global_avgpool_mod(data_shape, keepdims, dtype):
+    """
+    Create a Relax module for Global Average Pooling (GAP).
+
+    Args:
+        data_shape (tuple): Input tensor shape (N, C, H, W)
+        dtype (str): Data type
+
+    Returns:
+        tvm.IRModule: Relax GAP module
+    """
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor(data_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(R.mean(data, axis=[2, 3], keepdims=keepdims))
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_global_avgpool_expected_codegen(input_shape, keep_dims, dtype):
+    """
+    Generate expected codegen for Global Average Pooling.
+
+    Args:
+        input_shape (tuple): Input shape (N, C, H, W)
+        dtype (str): Data type
+
+    Returns:
+        dict: Expected codegen output
+    """
+    output_shape = (
+        [input_shape[0], input_shape[1]]
+        if not keep_dims
+        else [input_shape[0], input_shape[1], 1, 1]
+    )
+    attrs = {
+        "num_inputs": "1",
+        "num_outputs": "1",
+        "clml_version": [["3"]],
+        "shape": [[list(output_shape)]],
+        "dtype": [[dtype]],
+        "axis": [["2", "3"]],
+        "keepdims": [["1" if keep_dims else "0"]],
+    }
+
+    exp_codegen = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(input_shape)]], "dtype": [[str(dtype)]]},
+        },
+        {"op": "kernel", "name": "", "inputs": [[0, 0, 0]], "attrs": attrs},
+    ]
+    return exp_codegen
+
+
+def get_relax_global_maxpool_mod(data_shape, keepdims, dtype):
+    """
+    Create a Relax module for Global Average Pooling (GAP).
+
+    Args:
+        data_shape (tuple): Input tensor shape (N, C, H, W)
+        dtype (str): Data type
+
+    Returns:
+        tvm.IRModule: Relax GAP module
+    """
+    N, C, H, W = data_shape
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor(data_shape, dtype))
+
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.max_pool2d(
+                        data, pool_size=(H, W), strides=(1, 1), padding=(0, 
0), layout="NCHW"
+                    )
+                )
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+    return tvm.IRModule({"main": func})
+
+
+def get_global_maxpool_expected_codegen(input_shape, pool_size, stride, 
padding, pool_type, dtype):
+    import math
+
+    adjusted_input_shape = [
+        input_shape[0],
+        input_shape[1],
+        input_shape[2] + padding[0] + padding[1],
+        input_shape[3] + padding[2] + padding[3],
+    ]
+
+    output_shape = [adjusted_input_shape[0], adjusted_input_shape[1], 1, 1]
+
+    attrs = {
+        "ceil_mode": [["0"]],
+        "clml_version": [["3"]],
+        "dilation": [["1", "1"]],
+        "layout": [["NCHW"]],
+        "num_inputs": "1",
+        "num_outputs": "1",
+        "out_layout": [["NCHW"]],
+        "padding": [[str(0) for p in padding]],
+        "pool_size": [[str(p) for p in pool_size]],
+        "shape": [[list(output_shape)]],
+        "dtype": [[dtype]],
+        "strides": [[str(s) for s in stride]],
+        "count_include_pad": [["0"]],
+    }
+    if sum(padding):
+        attrs["count_include_pad"] = [["0"]]
+
+    exp_codegen = [
+        {
+            "op": "input",
+            "name": "",
+            "attrs": {"shape": [[list(adjusted_input_shape)]], "dtype": 
[[str(dtype)]]},
+        },
+        {
+            "op": "kernel",
+            "name": "",
+            "inputs": [[0, 0, 0]],
+            "attrs": attrs,
+        },
+    ]
+    return exp_codegen
diff --git a/tests/python/relax/backend/clml/test_clml_codegen.py 
b/tests/python/relax/backend/clml/test_clml_codegen.py
new file mode 100644
index 0000000000..b03d6afa1c
--- /dev/null
+++ b/tests/python/relax/backend/clml/test_clml_codegen.py
@@ -0,0 +1,505 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML integration operator tests."""
+import pytest
+import numpy as np
+import tvm
+import tvm.testing
+import json
+
+from tvm import relax
+from tvm.script import relax as R
+from tvm.script import ir as I
+from tvm.script import tir as T
+from tvm.script.ir_builder import IRBuilder
+from tvm.script.ir_builder import relax as relax_builder
+from tvm.relax.backend.adreno import clml
+from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad
+
+from mod_utils import (
+    get_relax_conv2d_mod,
+    get_clml_conv2d_codegen,
+    get_relax_conv2d_transpose_mod,
+    get_conv2d_transpose_expected_codegen,
+    get_batchnorm_mod,
+    get_binary_op_mod,
+    get_unary_op_mod,
+    get_relax_maxpool_mod,
+    get_maxpool_expected_codegen,
+    get_relax_avgpool_mod,
+    get_avgpool_expected_codegen,
+    get_relax_reshape_mod,
+    get_relax_reshape_codegen,
+    get_relax_global_avgpool_mod,
+    get_global_avgpool_expected_codegen,
+    get_relax_global_maxpool_mod,
+    get_global_maxpool_expected_codegen,
+)
+
+
+def compare_codegen(clml_mod, clml_codegen):
+    source = clml_mod.attrs["external_mods"][0].get_source()
+    codegen = json.loads(source)["nodes"]
+    for node in range(len(codegen)):
+        if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
+            codegen[node]["name"] = ""
+        if codegen[node]["op"] == "kernel":
+            codegen[node]["name"] = ""
+    codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
+    known_good_codegen_str = json.dumps(clml_codegen, sort_keys=True, indent=2)
+    assert codegen_str == known_good_codegen_str, (
+        f"The JSON produced by codegen does not match the expected result. \n"
+        f"Actual={codegen_str} \n"
+        f"Expected={known_good_codegen_str}"
+    )
+
+
+def verify(mod, params_np, clml_codegen):
+    mod = tvm.relax.transform.BindParams("main", params_np)(mod)
+    clml_mod = OpenCLMLOffLoad()(mod)
+    compare_codegen(clml_mod, clml_codegen)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "kernel_h, kernel_w, padding, stride, dilation, out_channels, shape, 
has_bias, has_bn, has_activation, has_pad, is_depthwise",
+    [
+        (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, 
True, False),
+        (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, 
False, False),
+        (5, 5, (2, 2), (1, 1), (1, 1), 16, (16, 64, 64), False, True, True, 
False, False),
+        (7, 7, (3, 3), (2, 2), (1, 1), 32, (3, 224, 224), True, False, True, 
True, False),
+        (3, 3, (0, 0), (1, 1), (1, 1), 512, (256, 14, 14), True, False, True, 
False, False),
+        (1, 1, (0, 0), (1, 1), (1, 1), 1024, (512, 7, 7), True, False, True, 
False, False),
+        (1, 3, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), True, False, True, 
False, False),
+        (3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), False, True, True, 
True, False),
+    ],
+)
+def test_conv2d_offload(
+    kernel_h,
+    kernel_w,
+    padding,
+    stride,
+    dilation,
+    out_channels,
+    shape,
+    has_bias,
+    has_bn,
+    has_activation,
+    has_pad,
+    is_depthwise,
+    dtype,
+):
+    low, high = 0, 1
+    data_shape = (1, *shape)
+    if is_depthwise:
+        groups = data_shape[1] // out_channels
+    else:
+        groups = 1
+    padding = (padding[0], padding[1], padding[0], padding[1])
+
+    weight_format = "IOHW" if is_depthwise else "OIHW"
+    weight_shape = (out_channels, data_shape[1] // groups, kernel_h, kernel_w)
+
+    weight = np.random.uniform(low, high, size=weight_shape).astype(dtype)
+    bias = np.random.uniform(low, high, size=(1, weight_shape[0], 1, 
1)).astype(dtype)
+
+    gamma = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    beta = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    mean = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    variance = np.random.uniform(low, high, 
size=(weight_shape[0],)).astype(dtype)
+
+    params_np = {"weight": weight}
+    if has_bias:
+        params_np["bias"] = bias
+    if has_bn:
+        params_np.update({"gamma": gamma, "beta": beta, "mean": mean, 
"variance": variance})
+
+    mod = get_relax_conv2d_mod(
+        data_shape,
+        weight_shape,
+        stride=stride,
+        dilation=dilation,
+        padding=padding,
+        weight_layout=weight_format,
+        groups=groups,
+        dtype=dtype,
+        has_bias=has_bias,
+        has_bn=has_bn,
+        has_activation=has_activation,
+        has_pad=has_pad,
+        is_depthwise=is_depthwise,
+    )
+
+    clml_codegen = get_clml_conv2d_codegen(
+        data_shape,
+        weight_shape,
+        stride=stride,
+        dilation=dilation,
+        padding=padding,
+        weight_layout=weight_format,
+        groups=groups,
+        dtype=dtype,
+        has_bias=has_bias,
+        has_bn=has_bn,
+        has_activation=has_activation,
+        has_pad=has_pad,
+        is_depthwise=is_depthwise,
+    )
+
+    verify(mod, params_np, clml_codegen)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "dshape, kshape, channels, kernel_size, strides, padding, out_shape",
+    [
+        ((1, 256, 100, 100), (64, 256, 4, 4), 64, (4, 4), (2, 2), (0, 0, 0, 
0), (1, 64, 202, 202)),
+        ((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), 
(1, 64, 400, 400)),
+        ((1, 64, 200, 200), (64, 64, 4, 4), 64, (4, 4), (2, 2), (1, 1, 1, 1), 
(1, 64, 400, 400)),
+        ((1, 64, 400, 400), (16, 64, 4, 4), 16, (4, 4), (2, 2), (1, 1, 1, 1), 
(1, 16, 800, 800)),
+    ],
+)
+def test_conv2d_transpose(
+    dshape, kshape, channels, kernel_size, strides, padding, dtype, out_shape
+):
+    low, high = -1, 1
+    weight = np.random.uniform(low, high, size=kshape).astype(dtype)
+
+    params_np = {"weight": weight}
+
+    mod = get_relax_conv2d_transpose_mod(
+        dshape,
+        kshape,
+        channels=channels,
+        stride=strides,
+        padding=padding,
+        dtype=dtype,
+    )
+
+    exp_codegen = get_conv2d_transpose_expected_codegen(
+        dshape=dshape,
+        kshape=kshape,
+        channels=channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        dilation=(1, 1),
+        dtype=dtype,
+        output_shape=out_shape,
+    )
+    verify(mod, params_np, exp_codegen)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 14, 14), 1, 3e-4],
+        [(1, 14, 256, 256), 1, 3e-4],
+        [(1, 14, 256, 256), 1, 3e-4],
+        [(1, 256, 1, 1), 1, 3e-4],
+    ],
+)
+def test_batchnorm(dtype, trials):
+    low, high = 0, 1
+    if clml.clml_sdk_version() < 3:
+        print("Skip due to unsupported CLML version:", clml.clml_sdk_version())
+        return
+
+    (input_shape, axis, epsilon) = trials
+    channels = input_shape[axis]
+
+    def _get_axis_tuple(axis):
+        if axis == 0:
+            return (1, 2, 3)
+        elif axis == 1:
+            return (0, 2, 3)
+        elif axis == 2:
+            return (0, 1, 3)
+        else:
+            return (0, 1, 2)
+
+    data = np.random.uniform(low, high, size=(input_shape)).astype(dtype)
+    gamma = np.random.uniform(low, high, size=(channels)).astype(dtype)
+    beta = np.random.uniform(low, high, size=(channels)).astype(dtype)
+    mean = np.mean(data, _get_axis_tuple(axis), keepdims=False)
+    variance = np.var(data, _get_axis_tuple(axis), keepdims=False)
+
+    params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, 
"moving_var": variance}
+    mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype)
+    exp_codegen = [
+        {
+            "attrs": {"dtype": [[dtype]], "shape": [[input_shape]]},
+            "name": "",
+            "op": "input",
+        },
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {"attrs": {"dtype": [[dtype]], "shape": [[[channels]]]}, "name": "", 
"op": "const"},
+        {
+            "attrs": {
+                "axis": [[str(axis)]],
+                "center": [["1"]],
+                "dtype": [[dtype]],
+                "clml_version": [["3"]],
+                "momentum": [["0.10000000000000001"]],
+                "epsilon": [["0.00029999999999999997"]],
+                "num_inputs": "5",
+                "num_outputs": "1",
+                "scale": [["1"]],
+                "shape": [[input_shape]],
+            },
+            "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0]],
+            "name": "",
+            "op": "kernel",
+        },
+    ]
+    verify(mod, params_np, exp_codegen)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "a_shape, b_shape, op",
+    [
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.add),
+        ((1, 256), (1, 256), R.add),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.subtract),
+        ((1, 256), (1, 256), R.subtract),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.multiply),
+        ((1, 256), (1, 256), R.multiply),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.divide),
+        ((1, 256), (1, 256), R.divide),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.minimum),
+        ((1, 256), (1, 256), R.minimum),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.maximum),
+        ((1, 256), (1, 256), R.maximum),
+    ],
+)
[email protected]_openclml
+def test_binary_ops(a_shape, b_shape, op, dtype):
+    def _verify(mod):
+        expected_codegen_str = [
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[a_shape]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[b_shape]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "clml_version": [["3"]],
+                    "dtype": [[dtype]],
+                    "num_inputs": "2",
+                    "num_outputs": "1",
+                    "shape": [[a_shape]],
+                },
+                "inputs": [[0, 0, 0], [1, 0, 0]],
+                "name": "",
+                "op": "kernel",
+            },
+        ]
+        verify(mod, {}, expected_codegen_str)
+
+    (mod, _) = get_binary_op_mod(a_shape, b_shape, op, dtype)
+
+    _verify(mod)
+
+
[email protected]_openclml
[email protected](
+    "dtype",
+    [
+        "float32",
+    ],
+)
[email protected](
+    "a_shape, op",
+    [
+        ((1, 64, 14, 14), R.nn.relu),
+        ((1, 256, 1, 1), R.nn.relu),
+        ((1, 14, 256, 256), R.nn.relu),
+        ((1, 14, 14, 256), R.nn.relu),
+    ],
+)
[email protected]_openclml
+def test_unary_ops(a_shape, op, dtype):
+    def _verify(mod):
+        expected_codegen_str = [
+            {
+                "attrs": {
+                    "dtype": [[dtype]],
+                    "shape": [[a_shape]],
+                },
+                "name": "",
+                "op": "input",
+            },
+            {
+                "attrs": {
+                    "activation_type": [["relu"]],
+                    "clml_version": [["3"]],
+                    "dtype": [[dtype]],
+                    "num_inputs": "1",
+                    "num_outputs": "1",
+                    "shape": [[a_shape]],
+                },
+                "inputs": [[0, 0, 0]],
+                "name": "",
+                "op": "kernel",
+            },
+        ]
+        verify(mod, {}, expected_codegen_str)
+
+    (mod, _) = get_unary_op_mod(a_shape, op, dtype)
+
+    _verify(mod)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
+        [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
+    ],
+)
+def test_max_pool(dtype, trials):
+    low, high = -1, 1
+    (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
+    mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, 
dilation, padding, has_pad)
+    params_np = {}
+
+    expected_codegen_str = get_maxpool_expected_codegen(
+        input_shape, pool_size, stride, padding, "maxpool2d", dtype
+    )
+    verify(mod, params_np, expected_codegen_str)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
+        [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
+    ],
+)
+def test_avg_pool(dtype, trials):
+    low, high = -1, 1
+    (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
+    mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, 
dilation, padding, has_pad)
+    params_np = {}
+    exp_codegen_str = get_avgpool_expected_codegen(
+        input_shape, pool_size, stride, padding, "avg_pool2d", dtype
+    )
+    verify(mod, params_np, exp_codegen_str)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 3, 32, 32), (1, 4, -1, 32)],
+        [(1, 4, 8, 32), (1, 4, -1, 16)],
+        [(1, 64, 3, 3), (1, 32, 3, -1)],
+    ],
+)
+def test_reshape(dtype, trials):
+    low, high = -1, 1
+    (input_shape, output_shape) = trials
+    mod = get_relax_reshape_mod(input_shape, output_shape, dtype)
+    params_np = {}
+    expected_codegen = get_relax_reshape_codegen(input_shape, output_shape, 
dtype)
+    verify(mod, params_np, expected_codegen)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), True],
+        [(1, 256, 17, 17), False],
+        [(1, 1024, 14, 14), True],
+        [(1, 32, 256, 256), False],
+    ],
+)
+def test_global_avg_pool(dtype, trials):
+    """Test function for global average pooling."""
+    low, high = -1, 1
+    (input_shape, keep_dims) = trials
+    mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype)
+    params_np = {}
+    exp_codegen_str = get_global_avgpool_expected_codegen(input_shape, 
keep_dims, dtype)
+    verify(mod, params_np, exp_codegen_str)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), True],
+        [(1, 256, 17, 17), False],
+        [(1, 1024, 14, 14), True],
+        [(1, 32, 256, 256), False],
+    ],
+)
+def test_global_max_pool(dtype, trials):
+    """Test function for global average pooling."""
+    low, high = -1, 1
+    (input_shape, keep_dims) = trials
+    N, C, H, W = input_shape
+    pool_size = (H, W)
+    stride = (1, 1)
+    padding = (0, 0, 0, 0)
+    mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype)
+    params_np = {}
+    exp_codegen_str = get_global_maxpool_expected_codegen(
+        input_shape, pool_size, stride, padding, "global_max", dtype
+    )
+    verify(mod, params_np, exp_codegen_str)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py 
b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py
new file mode 100644
index 0000000000..4e5b4b652b
--- /dev/null
+++ b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py
@@ -0,0 +1,329 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML integration operator tests."""
+import pytest
+import numpy as np
+import tvm
+import tvm.testing
+import json
+
+from tvm import relax, rpc
+from tvm.script import relax as R
+from tvm.script import ir as I
+from tvm.script import tir as T
+from tvm.script.ir_builder import IRBuilder
+from tvm.script.ir_builder import relax as relax_builder
+from tvm.relax.backend.adreno import clml
+from utils import run_compare
+
+from mod_utils import (
+    get_relax_conv2d_mod,
+    get_batchnorm_mod,
+    get_binary_op_mod,
+    get_unary_op_mod,
+    get_relax_maxpool_mod,
+    get_relax_avgpool_mod,
+    get_relax_reshape_mod,
+    get_relax_reshape_codegen,
+    get_relax_global_avgpool_mod,
+    get_relax_global_maxpool_mod,
+)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "kernel_h, kernel_w, padding, stride, dilation, out_channels, shape, 
has_bias, has_bn, has_activation, has_pad, is_depthwise",
+    [
+        (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, 
True, False),
+        (3, 3, (1, 1), (1, 1), (1, 1), 64, (3, 224, 224), False, True, False, 
False, False),
+        (5, 5, (2, 2), (1, 1), (1, 1), 16, (16, 64, 64), False, True, True, 
False, False),
+        (7, 7, (3, 3), (2, 2), (1, 1), 32, (3, 224, 224), True, False, True, 
True, False),
+        (3, 3, (0, 0), (1, 1), (1, 1), 512, (256, 14, 14), True, False, True, 
False, False),
+        (1, 1, (0, 0), (1, 1), (1, 1), 1024, (512, 7, 7), True, False, True, 
False, False),
+        (1, 3, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), True, False, True, 
False, False),
+        (3, 1, (0, 0), (1, 1), (1, 1), 64, (64, 7, 7), False, True, True, 
True, False),
+    ],
+)
+def test_conv2d_offload(
+    kernel_h,
+    kernel_w,
+    padding,
+    stride,
+    dilation,
+    out_channels,
+    shape,
+    has_bias,
+    has_bn,
+    has_activation,
+    has_pad,
+    is_depthwise,
+    dtype,
+    rpc,
+):
+    low, high = 0, 1
+    data_shape = (1, *shape)
+    if is_depthwise:
+        groups = data_shape[1] // out_channels
+    else:
+        groups = 1
+    padding = (padding[0], padding[1], padding[0], padding[1])
+
+    weight_format = "IOHW" if is_depthwise else "OIHW"
+    weight_shape = (out_channels, data_shape[1] // groups, kernel_h, kernel_w)
+
+    data = np.random.uniform(low, high, size=data_shape).astype(dtype)
+    weight = np.random.uniform(low, high, size=weight_shape).astype(dtype)
+    bias = np.random.uniform(low, high, size=(1, weight_shape[0], 1, 
1)).astype(dtype)
+
+    gamma = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    beta = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    mean = np.random.uniform(low, high, size=(weight_shape[0],)).astype(dtype)
+    variance = np.random.uniform(low, high, 
size=(weight_shape[0],)).astype(dtype)
+
+    inputs = [data]
+    params_np = {"weight": weight}
+    if has_bias:
+        params_np["bias"] = bias
+    if has_bn:
+        params_np.update({"gamma": gamma, "beta": beta, "mean": mean, 
"variance": variance})
+
+    mod = get_relax_conv2d_mod(
+        data_shape,
+        weight_shape,
+        stride=stride,
+        dilation=dilation,
+        padding=padding,
+        weight_layout=weight_format,
+        groups=groups,
+        dtype=dtype,
+        has_bias=has_bias,
+        has_bn=has_bn,
+        has_activation=has_activation,
+        has_pad=has_pad,
+        is_depthwise=is_depthwise,
+    )
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 14, 14), 1, 3e-4],
+        [(1, 14, 256, 256), 1, 3e-4],
+        [(1, 14, 256, 256), 1, 3e-4],
+        [(1, 256, 1, 1), 1, 3e-4],
+    ],
+)
+def test_batchnorm(dtype, trials, rpc):
+    low, high = 0, 1
+    if clml.clml_sdk_version() < 3:
+        print("Skip due to unsupported CLML version:", clml.clml_sdk_version())
+        return
+
+    (input_shape, axis, epsilon) = trials
+    channels = input_shape[axis]
+
+    def _get_axis_tuple(axis):
+        if axis == 0:
+            return (1, 2, 3)
+        elif axis == 1:
+            return (0, 2, 3)
+        elif axis == 2:
+            return (0, 1, 3)
+        else:
+            return (0, 1, 2)
+
+    data = np.random.uniform(low, high, size=(input_shape)).astype(dtype)
+    gamma = np.random.uniform(low, high, size=(channels)).astype(dtype)
+    beta = np.random.uniform(low, high, size=(channels)).astype(dtype)
+    mean = np.mean(data, _get_axis_tuple(axis), keepdims=False)
+    variance = np.var(data, _get_axis_tuple(axis), keepdims=False)
+
+    inputs = [data]
+    params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, 
"moving_var": variance}
+    mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype)
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "a_shape, b_shape, op",
+    [
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.add),
+        ((1, 256), (1, 256), R.add),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.subtract),
+        ((1, 256), (1, 256), R.subtract),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.multiply),
+        ((1, 256), (1, 256), R.multiply),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.divide),
+        ((1, 256), (1, 256), R.divide),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.minimum),
+        ((1, 256), (1, 256), R.minimum),
+        ((1, 64, 14, 14), (1, 64, 14, 14), R.maximum),
+        ((1, 256), (1, 256), R.maximum),
+    ],
+)
[email protected]_openclml
+def test_binary_ops(a_shape, b_shape, op, rpc, dtype):
+    (mod, inputs) = get_binary_op_mod(a_shape, b_shape, op, dtype)
+    run_compare(mod, inputs, {}, rpc)
+
+
[email protected]_openclml
[email protected](
+    "dtype",
+    [
+        "float32",
+    ],
+)
[email protected](
+    "a_shape, op",
+    [
+        ((1, 64, 14, 14), R.nn.relu),
+        ((1, 256, 1, 1), R.nn.relu),
+        ((1, 14, 256, 256), R.nn.relu),
+        ((1, 14, 14, 256), R.nn.relu),
+    ],
+)
[email protected]_openclml
+def test_unary_ops(a_shape, op, rpc, dtype):
+    (mod, inputs) = get_unary_op_mod(a_shape, op, dtype)
+    run_compare(mod, inputs, {}, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
+        [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
+    ],
+)
+def test_max_pool(dtype, trials, rpc):
+    low, high = -1, 1
+    (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
+    data = np.random.uniform(low, high, size=input_shape).astype(dtype)
+    inputs = [data]
+    mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, 
dilation, padding, has_pad)
+    params_np = {}
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), (3, 3), (2, 2), (1, 1), (0, 0, 0, 0), False],
+        [(1, 256, 17, 17), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 1024, 14, 14), (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), False],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (3, 3), (2, 2), (1, 1), (0, 1, 0, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 1, 1, 1), True],
+        [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True],
+    ],
+)
+def test_avg_pool(dtype, trials, rpc):
+    low, high = -1, 1
+    (input_shape, pool_size, stride, dilation, padding, has_pad) = trials
+    data = np.random.uniform(low, high, size=input_shape).astype(dtype)
+    inputs = [data]
+    mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, 
dilation, padding, has_pad)
+    params_np = {}
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 3, 32, 32), (1, 4, -1, 32)],
+        [(1, 4, 8, 32), (1, 4, -1, 16)],
+        [(1, 64, 3, 3), (1, 32, 3, -1)],
+    ],
+)
+def test_reshape(dtype, trials, rpc):
+    low, high = -1, 1
+    (input_shape, output_shape) = trials
+    data = np.random.uniform(low, high, size=input_shape).astype(dtype)
+    inputs = [data]
+    mod = get_relax_reshape_mod(input_shape, output_shape, dtype)
+    params_np = {}
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), True],
+        [(1, 256, 17, 17), False],
+        [(1, 1024, 14, 14), True],
+        [(1, 32, 256, 256), False],
+    ],
+)
+def test_global_avg_pool(dtype, trials, rpc):
+    """Test function for global average pooling."""
+    low, high = -1, 1
+    (input_shape, keep_dims) = trials
+    data = np.random.uniform(low, high, size=input_shape).astype(dtype)
+    inputs = [data]
+    mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype)
+    params_np = {}
+    run_compare(mod, inputs, params_np, rpc)
+
+
[email protected]_openclml
[email protected]("dtype", ["float32"])
[email protected](
+    "trials",
+    [
+        [(1, 64, 147, 147), True],
+        [(1, 256, 17, 17), False],
+        [(1, 1024, 14, 14), True],
+        [(1, 32, 256, 256), False],
+    ],
+)
+def test_global_max_pool(dtype, trials, rpc):
+    """Test function for global average pooling."""
+    low, high = -1, 1
+    (input_shape, keep_dims) = trials
+    N, C, H, W = input_shape
+    pool_size = (H, W)
+    stride = (1, 1)
+    padding = (0, 0, 0, 0)
+    data = np.random.uniform(low, high, size=input_shape).astype(dtype)
+    inputs = [data]
+    mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype)
+    params_np = {}
+    run_compare(mod, inputs, params_np, rpc)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relax/backend/clml/utils.py 
b/tests/python/relax/backend/clml/utils.py
new file mode 100644
index 0000000000..22b587c964
--- /dev/null
+++ b/tests/python/relax/backend/clml/utils.py
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Run utils for CLML integration operator tests"""
+import pytest
+import numpy as np
+import json
+import tvm
+import tvm.testing
+import copy
+
+from tvm import relax, rpc
+from tvm.relax import transform
+from tvm import dlight as dl
+from tvm.contrib import utils, ndk
+from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad
+
+
+def build_and_run(
+    mod,
+    inputs_np,
+    target,
+    rpc=None,
+    load_path="vm_library.so",
+    clml_enable=False,
+):
+
+    tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu")
+    pipeline = relax.pipeline.get_default_pipeline(tgt)
+    mod = pipeline(mod)
+    if rpc:
+        ex = relax.build(mod, tgt)
+        temp = utils.tempdir()
+        path = temp.relpath(load_path)
+        path = "./" + load_path
+        ex.export_library(path, fcompile=ndk.create_shared, 
options=["-shared", "-fPIC", "-lm"])
+        rpc.upload(path)
+        rexec = rpc.load_module(load_path)
+        dev = rpc.cl(0)
+        vm = relax.VirtualMachine(rexec, dev)
+    else:
+        ex = relax.build(mod, target)
+        dev = tvm.device(target, 0)
+        vm = relax.VirtualMachine(ex, dev)
+
+    f = vm["main"]
+    inputs = [tvm.nd.array(inp, dev) for inp in inputs_np]
+    vm.set_input("main", *inputs)
+    vm.invoke_stateful("main")
+    tvm_output = vm.get_outputs("main")
+    return tvm_output.numpy()
+
+
+def run_compare(mod, inputs, params_np, rpc=None):
+    clml_mod = copy.deepcopy(mod)
+    mod = tvm.relax.transform.BindParams("main", params_np)(mod)
+    clml_mod = tvm.relax.transform.BindParams("main", params_np)(clml_mod)
+
+    if not rpc:
+        return
+
+    ref = build_and_run(
+        mod,
+        inputs,
+        tvm.target.adreno(),
+        rpc=rpc,
+        load_path="vm_library_opencl.so",
+    )
+    out = build_and_run(
+        clml_mod,
+        inputs,
+        tvm.target.adreno(clml=True),
+        rpc=rpc,
+        load_path="vm_library_clml.so",
+        clml_enable=True,
+    )
+    np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5)
diff --git a/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py 
b/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py
new file mode 100644
index 0000000000..fc68f51b9f
--- /dev/null
+++ b/tests/python/relax/test_transform_fold_batch_norm_to_conv2d.py
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import relax
+from tvm.script import relax as R
+from tvm.script import ir as I
+from tvm.script.ir_builder import IRBuilder
+from tvm.ir.module import IRModule
+from tvm.script.ir_builder import relax as relax_builder
+from tvm.relax.expr_functor import PyExprVisitor, visitor
+
+
+def get_conv2d_batchnorm_sample():
+    with IRBuilder() as builder:
+        with relax_builder.function():
+            R.func_name("main")
+            data = R.arg("data", R.Tensor((1, 3, 224, 224), "float32"))
+            weight = R.arg("weight", R.Tensor((32, 3, 3, 3), "float32"))
+            with R.dataflow() as frame:
+                output = R.emit(
+                    R.nn.conv2d(
+                        data,
+                        weight,
+                        out_dtype="float32",
+                        strides=(1, 1),
+                        dilation=(1, 1),
+                        padding=(1, 1),
+                        data_layout="NCHW",
+                        kernel_layout="OIHW",
+                        groups=1,
+                    )
+                )
+                gamma = R.arg("gamma", R.Tensor((32,), "float32"))
+                beta = R.arg("beta", R.Tensor((32,), "float32"))
+                mean = R.arg("mean", R.Tensor((32,), "float32"))
+                variance = R.arg("variance", R.Tensor((32,), "float32"))
+                output = R.emit(
+                    R.nn.batch_norm(output, gamma, beta, mean, variance, 
axis=1, epsilon=1e-5)[0]
+                )
+                R.output(output)
+
+            R.func_ret_value(frame.output_vars[0])
+
+    func = builder.get()
+
+    return tvm.IRModule({"main": func})
+
+
+def test_fold_batchnorm_info_conv2d():
+    mod = get_conv2d_batchnorm_sample()
+    mod_fold = get_conv2d_batchnorm_sample()
+
+    target = tvm.target.Target("llvm", host="llvm")
+    data_in = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype(np.float32))
+
+    weight_data = tvm.nd.array(np.random.rand(32, 3, 3, 3).astype(np.float32))
+    gamma_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    beta_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    mean_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    variance_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    params_np = {
+        "weight": weight_data,
+        "gamma": gamma_data,
+        "beta": beta_data,
+        "mean": mean_data,
+        "variance": variance_data,
+    }
+
+    mod = tvm.relax.transform.BindParams("main", params_np)(mod)
+    mod_fold = tvm.relax.transform.BindParams("main", params_np)(mod_fold)
+
+    # Normal build
+    mod = tvm.relax.transform.DecomposeOpsForInference()(mod)
+    ex = relax.build(mod, target)
+    vm = relax.VirtualMachine(ex, tvm.cpu())
+    out = vm["main"](data_in)
+
+    # Fold BN to Conv2D
+    mod_fold = relax.transform.FoldBatchnormToConv2D()(mod_fold)
+    mod_fold = relax.transform.FoldConstant()(mod_fold)
+    ex_fold = relax.build(mod_fold, target)
+    vm_fold = relax.VirtualMachine(ex_fold, tvm.cpu())
+    out_fold = vm_fold["main"](data_in)
+
+    tvm.testing.assert_allclose(out.numpy(), out_fold.numpy(), rtol=1e-5, 
atol=1e-5)
+
+
+@visitor
+class VerifyFolding(PyExprVisitor):  # pylint: disable=abstract-method
+    def visit(self, mod: IRModule) -> None:
+        """Entry point"""
+        for _, func in mod.functions_items():
+            if isinstance(func, relax.Function):
+                self.visit_expr(func)
+
+    def visit_call_(self, call: relax.Call) -> None:  # pylint: 
disable=arguments-renamed
+        assert (
+            call.op.name != "relax.nn.batch_norm"
+        ), f"Batchnorm op shouldn't be present after folding to previous 
conv2d"
+
+
+def test_fold_batchnorm_info_conv2d_transform():
+    mod = get_conv2d_batchnorm_sample()
+    mod = relax.transform.FoldBatchnormToConv2D()(mod)
+    weight_data = tvm.nd.array(np.random.rand(32, 3, 3, 3).astype(np.float32))
+    gamma_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    beta_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    mean_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    variance_data = tvm.nd.array(np.random.rand(32).astype(np.float32))
+    params_np = {
+        "weight": weight_data,
+        "gamma": gamma_data,
+        "beta": beta_data,
+        "mean": mean_data,
+        "variance": variance_data,
+    }
+    mod = tvm.relax.transform.BindParams("main", params_np)(mod)
+    mod = relax.transform.FoldBatchnormToConv2D()(mod)
+    mod = relax.transform.FoldConstant()(mod)
+
+    VerifyFolding().visit(mod)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/scripts/task_config_build_gpu.sh 
b/tests/scripts/task_config_build_gpu.sh
index 244ce4b8a5..4768867826 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -46,3 +46,4 @@ echo set\(USE_CUTLASS ON\) >> config.cmake
 # Temporary disable MSC
 # echo set\(USE_MSC ON\) >> config.cmake
 echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake
+echo set\(USE_CLML ON\) >> config.cmake
diff --git a/tests/scripts/task_python_adreno.sh 
b/tests/scripts/task_python_adreno.sh
index 684a63e77f..f019cd1ecc 100755
--- a/tests/scripts/task_python_adreno.sh
+++ b/tests/scripts/task_python_adreno.sh
@@ -75,5 +75,14 @@ for node_id in $CLML_TESTS; do
     i=$((i+1))
 done
 
+# Relax test
+RELAX_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder 
tests/python/relax/backend/clml 2> /dev/null  | grep -v dlerror)
+i=0
+for node_id in $RELAX_TESTS; do
+    echo "$node_id"
+    CXX=${TVM_NDK_CC} run_pytest ctypes 
"$TVM_INTEGRATION_TESTSUITE_NAME-openclml-relax-$i" "$node_id" --reruns=0
+    i=$((i+1))
+done
+
 kill ${TRACKER_PID}
 kill ${DEVICE_PID}
diff --git a/tests/scripts/unity/task_python_relax.sh 
b/tests/scripts/unity/task_python_relax.sh
index 688812b35d..5eb2a9e420 100755
--- a/tests/scripts/unity/task_python_relax.sh
+++ b/tests/scripts/unity/task_python_relax.sh
@@ -39,3 +39,6 @@ TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm}" pytest 
tests/python/dlight
 
 # Test for MSC
 # pytest tests/python/contrib/test_msc
+
+# Test for OpenCLML
+pytest tests/python/relax/backend/clml/

(tvm) branch main updated: [RELAX][BYOC] OpenCLML offload support for Relax (#17654)

Reply via email to