This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 27b0aad5a5 [BYOC-OpenCLML] OpenCLML integration with TVM. (#10243)
27b0aad5a5 is described below
commit 27b0aad5a55254815a076dbcacb53e9725019f9d
Author: Siva <[email protected]>
AuthorDate: Tue Jun 14 16:00:28 2022 +0530
[BYOC-OpenCLML] OpenCLML integration with TVM. (#10243)
* [BYOC-OpenCLML] OpenCLML integration with TVM.
* [BYOC-OpenCLML] Cleanup and review.
---
CMakeLists.txt | 3 +
cmake/config.cmake | 5 +
cmake/modules/LibInfo.cmake | 2 +
cmake/modules/contrib/CLML.cmake | 58 ++
python/tvm/relay/op/contrib/__init__.py | 1 +
python/tvm/relay/op/contrib/clml.py | 247 +++++
src/relay/backend/contrib/clml/codegen.cc | 412 ++++++++
src/runtime/contrib/clml/clml_runtime.cc | 1091 ++++++++++++++++++++
src/support/libinfo.cc | 2 +
.../python/contrib/test_clml}/__init__.py | 13 +-
tests/python/contrib/test_clml/infrastructure.py | 256 +++++
tests/python/contrib/test_clml/test_network.py | 139 +++
tests/python/contrib/test_clml/test_ops.py | 216 ++++
13 files changed, 2433 insertions(+), 12 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2238665644..6931b40c66 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,6 +110,8 @@ tvm_option(USE_TENSORRT_RUNTIME "Build with TensorRT
runtime" OFF)
tvm_option(USE_RUST_EXT "Build with Rust based compiler extensions, STATIC,
DYNAMIC, or OFF" OFF)
tvm_option(USE_VITIS_AI "Build with VITIS-AI Codegen support" OFF)
tvm_option(SUMMARIZE "Print CMake option summary after configuring" OFF)
+tvm_option(USE_CLML "Build with CLML Codegen support" OFF)
+tvm_option(USE_CLML_GRAPH_EXECUTOR "Build with CLML graph runtime" OFF)
# include directories
include_directories(${CMAKE_INCLUDE_PATH})
@@ -492,6 +494,7 @@ include(cmake/modules/contrib/ArmComputeLib.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
include(cmake/modules/contrib/VitisAI.cmake)
include(cmake/modules/contrib/Verilator.cmake)
+include(cmake/modules/contrib/CLML.cmake)
include(cmake/modules/Git.cmake)
include(cmake/modules/LibInfo.cmake)
include(cmake/modules/RustExt.cmake)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 2c22d2b498..212b565f25 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -269,6 +269,11 @@ set(USE_VITIS_AI OFF)
# Build Verilator codegen and runtime
set(USE_VERILATOR OFF)
+# Whether to use CLML codegen
+set(USE_CLML OFF)
+# USE_CLML_GRAPH_EXECUTOR - CLML SDK PATH or ON or OFF
+set(USE_CLML_GRAPH_EXECUTOR OFF)
+
# Build ANTLR parser for Relay text format
# Possible values:
# - ON: enable ANTLR by searching default locations (cmake find_program for
antlr4 and /usr/local for jar)
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 2c07a94ed5..06c42494a3 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -113,6 +113,8 @@ function(add_lib_info src_file)
TVM_INFO_USE_THRUST="${USE_THRUST}"
TVM_INFO_USE_VITIS_AI="${USE_VITIS_AI}"
TVM_INFO_USE_VULKAN="${USE_VULKAN}"
+ TVM_INFO_USE_CLML="${USE_CLML}"
+ TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}"
)
endfunction()
diff --git a/cmake/modules/contrib/CLML.cmake b/cmake/modules/contrib/CLML.cmake
new file mode 100644
index 0000000000..30e60423b0
--- /dev/null
+++ b/cmake/modules/contrib/CLML.cmake
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if(USE_CLML)
+ file(GLOB CLML_RELAY_CONTRIB_SRC src/relay/backend/contrib/clml/*.cc)
+ file(GLOB CLML_RUNTIME_MODULE src/runtime/contrib/clml/clml_runtime.cc)
+ list(APPEND COMPILER_SRCS ${CLML_RELAY_CONTRIB_SRC})
+ if(NOT USE_CLML_GRAPH_EXECUTOR)
+ list(APPEND COMPILER_SRCS ${CLML_RUNTIME_MODULE})
+ endif()
+ message(STATUS "Build with CLML support...")
+endif()
+
+if(USE_CLML_GRAPH_EXECUTOR)
+ set(CLML_PATH ${CMAKE_CURRENT_SOURCE_DIR}/clml)
+ # Detect custom CLML path.
+ if (NOT USE_CLML_GRAPH_EXECUTOR STREQUAL "ON")
+ set(CLML_PATH ${USE_CLML_GRAPH_EXECUTOR})
+ endif()
+
+ file(GLOB CLML_CONTRIB_SRC src/runtime/contrib/clml/*)
+
+ # Cmake needs to find clml library, include and support directories
+ # in the path specified by CLML_PATH.
+ set(CLML_INCLUDE_DIRS ${CLML_PATH}/include ${CLML_PATH})
+ include_directories(${CLML_INCLUDE_DIRS})
+ find_library(EXTERN_CLML_COMPUTE_LIB
+ NAMES OpenCL libOpenCL
+ HINTS "${CLML_PATH}" "${CLML_PATH}/lib64"
+ )
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_CLML_COMPUTE_LIB})
+ list(APPEND RUNTIME_SRCS ${CLML_CONTRIB_SRC})
+ message(STATUS "Build with CLML graph runtime support: "
+ ${EXTERN_CLML_COMPUTE_LIB})
+
+ # Set flag to detect CLML graph runtime support.
+ add_definitions(-DTVM_GRAPH_EXECUTOR_CLML)
+
+ message(STATUS "Enable OpenCL as fallback to CLML")
+ file(GLOB RUNTIME_OPENCL_SRCS src/runtime/opencl/*.cc)
+ list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
+ set(USE_OPENCL ON)
+
+endif()
diff --git a/python/tvm/relay/op/contrib/__init__.py
b/python/tvm/relay/op/contrib/__init__.py
index a03d0f6d4f..01708e8452 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/python/tvm/relay/op/contrib/__init__.py
@@ -26,3 +26,4 @@ from .ethosn import *
from .libtorch import *
from .tensorrt import *
from .cutlass import *
+from .clml import *
diff --git a/python/tvm/relay/op/contrib/clml.py
b/python/tvm/relay/op/contrib/clml.py
new file mode 100644
index 0000000000..cacd10de28
--- /dev/null
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -0,0 +1,247 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-argument
+"""CLML Library supported operators."""
+import tvm
+
+from tvm import relay
+from tvm._ffi import register_func
+from tvm.relay import transform
+from tvm.relay.build_module import bind_params_by_name
+
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item
+from .register import register_pattern_table
+from ..strategy.generic import is_depthwise_conv2d
+
+
+def is_clml_runtime_enabled():
+ """Check if the CLML graph runtime is present.
+
+ Returns
+ -------
+ ret: bool
+ True if present, False if not.
+ """
+ check_enabled = tvm.get_global_func("relay.op.is_clml_runtime_enabled",
True)
+ if check_enabled:
+ return check_enabled()
+ return False
+
+
+def partition_for_clml(mod, params=None):
+ """Partition the graph greedily offloading supported
+ operators to CLML Library.
+
+ Parameters
+ ----------
+ mod : Module
+ The module to run passes on.
+ params : Optional[Dict[str, NDArray]]
+ Constant input parameters.
+
+ Returns
+ -------
+ ret : annotated and partitioned module.
+ """
+
+ if params:
+ mod["main"] = bind_params_by_name(mod["main"], params)
+
+ seq = tvm.transform.Sequential(
+ [
+ transform.InferType(),
+ transform.FoldConstant(),
+ transform.MergeComposite(clml_pattern_table()),
+ transform.AnnotateTarget("clml", False),
+ transform.MergeCompilerRegions(),
+ transform.PartitionGraph(),
+ ]
+ )
+
+ result_mod = seq(mod)
+ return result_mod
+
+
+@register_func("relay.ext.clml.optimize")
+def preprocess_module(mod):
+ """
+ Pre-process a module containing functions ready for CLML codegen. For now
we enforce OIHW
+ kernel layout and fold the transforms away.
+
+ Parameters
+ ----------
+ mod : Module
+ The module to run passes on.
+
+ Returns
+ -------
+ preprocessed_mod : The processed module.
+ """
+
+ def convert_layout_conv2d(conv2d_function):
+ def convert_conv(attrs, inputs, tinfos, desired_layouts):
+ new_attrs = dict(attrs)
+ data_info = tinfos[0]
+ weight_info = tinfos[1]
+ desired_data_layout, desired_kernel_layout = map(str,
desired_layouts)
+ new_attrs["data_layout"] = desired_data_layout
+ new_attrs["kernel_layout"] = desired_kernel_layout
+
+ if is_depthwise_conv2d(
+ data_info.shape,
+ attrs["data_layout"],
+ weight_info.shape,
+ attrs["kernel_layout"],
+ attrs["groups"],
+ ):
+ dkl = desired_kernel_layout
+ new_attrs["kernel_layout"] = dkl[1] + dkl[0] + dkl[2] + dkl[3]
+ return conv2d_function(*inputs, **new_attrs)
+
+ return convert_conv
+
+ with OpAttrContext(
+ "nn.conv2d", "FTVMConvertOpLayout",
convert_layout_conv2d(tvm.relay.nn.conv2d)
+ ):
+ seq = tvm.transform.Sequential(
+ [
+ transform.ConvertLayout({"nn.conv2d": ["NCHW", "OIHW"]}),
+ transform.FoldConstant(),
+ ]
+ )
+ preprocessed_mod = seq(mod)
+ return preprocessed_mod
+
+
+@register_pattern_table("clml")
+def clml_pattern_table():
+ """Get the CLML pattern table."""
+
+ def conv_pattern():
+ """Create a convolution pattern."""
+ pattern = is_op("nn.conv2d")(wildcard(), is_constant())
+ pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x,
is_constant()))
+ pattern = pattern.optional(
+ lambda x: is_op("nn.batch_norm")(
+ x, is_constant(), is_constant(), is_constant(), is_constant()
+ )
+ )
+ pattern = pattern.optional(is_tuple_get_item)
+ pattern = pattern.optional(is_op("nn.relu"))
+ return pattern
+
+ def batch_norm_pattern():
+ """Create a batch norm pattern."""
+ pattern = is_op("nn.batch_norm")(
+ wildcard(), is_constant(), is_constant(), is_constant(),
is_constant()
+ )
+ pattern = is_tuple_get_item(pattern)
+ return pattern
+
+ def dense_pattern():
+ """Create a dense pattern."""
+ pattern = is_op("nn.dense")(wildcard(), is_constant())
+ pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+ return pattern
+
+ def pad_pattern():
+ """Create a pad pattern."""
+ pattern = is_op("nn.pad")(wildcard(), wildcard())
+ return pattern
+
+ def check_conv(extract):
+ """Check conv pattern is supported by CLML."""
+ call = extract
+ if isinstance(call, tvm.relay.expr.TupleGetItem):
+ call = call.tuple_value
+ elif call.op.name == "nn.relu":
+ call = call.args[0]
+ if isinstance(call, tvm.relay.expr.TupleGetItem):
+ call = call.tuple_value
+ while call.op.name != "nn.conv2d":
+ call = call.args[0]
+ attrs, args = call.attrs, call.args
+ if attrs.data_layout != "NCHW":
+ return False
+ data_typ = args[0].checked_type
+ kernel_typ = args[1].checked_type
+ is_depthwise = is_depthwise_conv2d(
+ data_typ.shape,
+ attrs["data_layout"],
+ kernel_typ.shape,
+ attrs["kernel_layout"],
+ attrs["groups"],
+ )
+ if attrs.groups != 1 and not is_depthwise:
+ return False
+ return True
+
+ return [
+ ("clml.conv2d", conv_pattern(), check_conv),
+ ("clml.dense", dense_pattern()),
+ ("clml.pad", pad_pattern()),
+ ("clml.batch_norm", batch_norm_pattern()),
+ ]
+
+
+def _register_external_op_helper(op_name, supported=True):
+ @tvm.ir.register_op_attr(op_name, "target.clml")
+ def _func_wrapper(expr):
+ return supported
+
+ return _func_wrapper
+
+
+_register_external_op_helper("clip")
+_register_external_op_helper("relu")
+_register_external_op_helper("nn.global_avg_pool2d")
+_register_external_op_helper("nn.global_max_pool2d")
+_register_external_op_helper("nn.softmax")
+_register_external_op_helper("reshape")
+
+
+class OpAttrContext(object):
+ """Temporarily changes the attr of an op."""
+
+ def __init__(self, op_name, attr_key, attr_value):
+ """Saves the required info for RAII pattern usage.
+
+ Parameters
+ ----------
+ op_name : str
+ The op name.
+
+ attr_key : str
+ The attribute name.
+
+ attr_value : object
+ The attribute value.
+ """
+ self.op = relay.op.get(op_name)
+ self.attr_key = attr_key
+ self.attr_value = attr_value
+
+ def __enter__(self):
+ self.older_attr = self.op.get_attr(self.attr_key)
+ self.op.reset_attr(self.attr_key)
+ self.op.set_attr(self.attr_key, self.attr_value)
+ return self
+
+ def __exit__(self, ptype, value, trace):
+ self.op.reset_attr(self.attr_key)
+ if self.older_attr:
+ self.op.set_attr(self.attr_key, self.older_attr)
diff --git a/src/relay/backend/contrib/clml/codegen.cc
b/src/relay/backend/contrib/clml/codegen.cc
new file mode 100644
index 0000000000..fa082a423d
--- /dev/null
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/clml/codegen.cc
+ * \brief Implementation of the Relay -> CLML JSON serializer.
+ */
+#include <tvm/ir/module.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/type.h>
+#include <tvm/tir/analysis.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "../../utils.h"
+#include "../codegen_json/codegen_json.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+
+/*!
+ * \brief Generates an CLMLModule from a relay expression. This "compilation"
+ * does not require CLML since the actual conversion using CLML APIs is
+ * deferred until creation of the runtime. This step simply serializes the
+ * relay program into a JSON string.
+ */
+class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
+ using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+ using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry;
+
+ public:
+ CLMLJSONSerializer(const std::string& symbol, const Expr& expr)
+ : JSONSerializer(symbol, expr), clml_symbol_(symbol) {}
+
+ /*!
+ * \brief A series of operators that form a composite
+ * convolution. Supports nn.conv2d
+ */
+ struct CompositeConvNode {
+ const CallNode* pad = nullptr;
+ const CallNode* conv = nullptr;
+ const CallNode* bn = nullptr;
+ const CallNode* bias = nullptr;
+ const CallNode* activation = nullptr;
+ std::string act_type;
+ };
+
+ /*!
+ * \brief Visit call nodes and generate appropriate JSON node.
+ *
+ * \param cn The current call node.
+ * \return A list of graph entry nodes.
+ */
+ std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) override {
+ if (cn->op.as<OpNode>()) {
+ return JSONSerializer::VisitExpr_(cn);
+ }
+ if (!cn->op.as<FunctionNode>()) {
+ LOG(FATAL) << "CLML JSON runtime does not support calls to " <<
cn->op->GetTypeKey();
+ }
+ auto fn = cn->op.as<FunctionNode>();
+ auto comp = fn->GetAttr<String>(attr::kComposite);
+ ICHECK(comp.defined()) << "CLML JSON runtime only supports composite
functions.";
+ const std::string name = comp.value();
+ std::shared_ptr<JSONGraphNode> json_node;
+ if (name == "clml.conv2d") {
+ json_node = CreateCompositeConvJSONNode(cn);
+ } else if (name == "clml.batch_norm") {
+ json_node = CreateBatchNormJSONNode(cn);
+ } else if (name == "clml.dense") {
+ json_node = CreateDenseJSONNode(cn);
+ } else if (name == "clml.pad") {
+ json_node = CreatePadJSONNode(cn);
+ } else {
+ LOG(FATAL) << "Unrecognized CLML pattern: " << name;
+ }
+ return AddNode(json_node, GetRef<Expr>(cn));
+ }
+
+ /*!
+ * \brief Visit call nodes and generate ordered params.
+ *
+ * \param cn The current constant node.
+ * \return A list of graph entry nodes.
+ */
+ std::vector<JSONGraphNodeEntry> VisitExpr_(const ConstantNode* cn) override {
+ std::string name = "clml_" + clml_symbol_ + "_const_" +
std::to_string(clml_params_.size());
+ clml_params_.push_back(name);
+ clml_params_map_[name] = cn->data;
+ auto node = std::make_shared<JSONGraphNode>(name, "const" /* op_type_ */);
+ return AddNode(node, GetRef<Expr>(cn));
+ }
+
+ Array<String> GetParams() const { return clml_params_; }
+ Map<String, runtime::NDArray> GetParamsMap() const {
+ return Map<String, runtime::NDArray>(clml_params_map_);
+ }
+
+ private:
+ std::string clml_symbol_;
+ Array<String> clml_params_;
+ std::unordered_map<String, runtime::NDArray> clml_params_map_;
+ /*!
+ * \brief Extract convolution nodes from a composite function.
+ *
+ * \param cn The call node of the composite function.
+ * \return Extracted composite convolution nodes.
+ */
+ static CompositeConvNode UnpackCompositeConvolution(const CallNode* cn) {
+ CompositeConvNode nodes{};
+
+ const auto* fn = cn->op.as<FunctionNode>();
+ ICHECK(fn);
+ // Traverse composite convolution function from child to parent
+ const auto* current_call = fn->body.as<CallNode>();
+ if (fn->body.as<TupleGetItemNode>()) {
+ auto tuple_item = fn->body.as<TupleGetItemNode>();
+ current_call = tuple_item->tuple.as<CallNode>();
+ } else {
+ current_call = fn->body.as<CallNode>();
+ }
+ if (backend::IsOp(current_call, "nn.relu")) {
+ nodes.activation = current_call;
+ nodes.act_type = "relu";
+ if (current_call->args[0].as<TupleGetItemNode>()) {
+ auto tuple_item = current_call->args[0].as<TupleGetItemNode>();
+ current_call = tuple_item->tuple.as<CallNode>();
+ } else {
+ current_call = current_call->args[0].as<CallNode>();
+ }
+ }
+ if (backend::IsOp(current_call, "nn.batch_norm")) {
+ nodes.bn = current_call;
+ current_call = current_call->args[0].as<CallNode>();
+ }
+ if (backend::IsOp(current_call, "add")) {
+ nodes.bias = current_call;
+ current_call = current_call->args[0].as<CallNode>();
+ }
+ // Enforce a convolution node exists at this point during traversal
+ ICHECK(backend::IsOp(current_call, "nn.conv2d"));
+ nodes.conv = current_call;
+ if (!current_call->args.empty() &&
current_call->args[0]->IsInstance<CallNode>()) {
+ current_call = current_call->args[0].as<CallNode>();
+ if (backend::IsOp(current_call, "nn.pad")) {
+ nodes.pad = current_call;
+ }
+ }
+ return nodes;
+ }
+
+ /*!
+ * \brief Create a JSON representation of a composite convolution.
+ *
+ * \param cn The call to be represented.
+ * \return A JSON representation of a specific operator.
+ */
+ std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode*
cn) {
+ CompositeConvNode nodes = UnpackCompositeConvolution(cn);
+
+ const auto* conv_attr = nodes.conv->attrs.as<Conv2DAttrs>();
+ ICHECK(conv_attr);
+
+ std::string name;
+ std::string name_prefix = "nn";
+
+ // Distinguish between normal and depth-wise convolution
+ if (conv_attr->channels.defined() &&
+ tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) &&
+ conv_attr->groups != 1) {
+ name = "depthwise_conv2d";
+ ICHECK(conv_attr->kernel_layout == "IOHW")
+ << "Kernel layout must be IOHW, has the module been pre-processed
correctly?";
+ } else {
+ name = "conv2d";
+ ICHECK(conv_attr->kernel_layout == "OIHW")
+ << "Kernel layout must be OIHW, has the module been pre-processed
correctly?";
+ }
+
+ // Inputs must be added in the same order they appear in the relay graph.
+ std::vector<JSONGraphNodeEntry> inputs;
+
+ inputs.push_back(VisitExpr(cn->args[0])[0]);
+ inputs.push_back(VisitExpr(nodes.conv->args[1])[0]);
+ if (nodes.bias) {
+ inputs.push_back(VisitExpr(nodes.bias->args[1])[0]);
+ }
+ // Deal with Batchnorm Fusing here
+ if (nodes.bn) {
+ inputs.push_back(VisitExpr(nodes.bn->args[1])[0]);
+ inputs.push_back(VisitExpr(nodes.bn->args[2])[0]);
+ inputs.push_back(VisitExpr(nodes.bn->args[3])[0]);
+ inputs.push_back(VisitExpr(nodes.bn->args[4])[0]);
+ }
+
+ auto json_node = std::make_shared<JSONGraphNode>(name_prefix + "." + name,
"kernel", inputs, 1);
+ SetCallNodeAttribute(json_node, nodes.conv);
+
+ if (nodes.bn) {
+ const auto* bn_attr = nodes.bn->attrs.as<BatchNormAttrs>();
+ std::vector<dmlc::any> bn_any_attr;
+ std::vector<std::string> bn_args = {
+ std::to_string(bn_attr->axis), std::to_string(bn_attr->epsilon),
+ std::to_string(bn_attr->center), std::to_string(bn_attr->scale)};
+ bn_any_attr.emplace_back(bn_args);
+ json_node->SetAttr("batchnorm", bn_any_attr);
+ }
+
+ // Override attributes
+ if (nodes.pad) {
+ const auto* pad_attr = nodes.pad->attrs.as<PadAttrs>();
+ ICHECK(pad_attr);
+ auto p = pad_attr->pad_width;
+ // Standard convolution pad layout for TVM: dimension wise pair of pre
and post padding.
+ // CLML takes dimension wise pre-padding followed by dimension wise
post-padding.
+ std::vector<std::string> padding =
{std::to_string(p[2][0].as<IntImmNode>()->value),
+
std::to_string(p[3][0].as<IntImmNode>()->value),
+
std::to_string(p[2][1].as<IntImmNode>()->value),
+
std::to_string(p[3][1].as<IntImmNode>()->value)};
+ std::vector<dmlc::any> padding_attr;
+ padding_attr.emplace_back(padding);
+ json_node->SetAttr("padding", padding_attr);
+ }
+
+ if (nodes.activation) {
+ std::vector<std::string> activation_type = {nodes.act_type};
+ std::vector<dmlc::any> act_attr;
+ act_attr.emplace_back(activation_type);
+ json_node->SetAttr("activation_type", act_attr);
+ }
+ return json_node;
+ }
+
+ /*!
+ * \brief Create a JSON representation of a Batchnorm operator.
+ *
+ * \param cn The call to be represented.
+ * \return A JSON representation of a specific operator.
+ */
+ std::shared_ptr<JSONGraphNode> CreateBatchNormJSONNode(const CallNode* cn) {
+ const auto* fn = cn->op.as<FunctionNode>();
+ ICHECK(fn);
+ const auto* tuple_item = fn->body.as<TupleGetItemNode>();
+ ICHECK(tuple_item);
+ const auto* bn = tuple_item->tuple.as<CallNode>();
+ ICHECK(bn);
+ const auto* bn_op = bn->op.as<OpNode>();
+ ICHECK(bn_op);
+ const std::string name = bn_op->name;
+
+ std::vector<JSONGraphNodeEntry> inputs;
+ inputs.push_back(VisitExpr(cn->args[0])[0]);
+ inputs.push_back(VisitExpr(bn->args[1])[0]);
+ inputs.push_back(VisitExpr(bn->args[2])[0]);
+ inputs.push_back(VisitExpr(bn->args[3])[0]);
+ inputs.push_back(VisitExpr(bn->args[4])[0]);
+ auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs,
1);
+ SetCallNodeAttribute(json_node, bn);
+ return json_node;
+ }
+
+ /*!
+ * \brief Create a JSON representation of a Dense operator.
+ *
+ * \param cn The call to be represented.
+ * \return A JSON representation of a specific operator.
+ */
+ std::shared_ptr<JSONGraphNode> CreateDenseJSONNode(const CallNode* cn) {
+ const auto* fn = cn->op.as<FunctionNode>();
+ ICHECK(fn);
+ const auto* dense = fn->body.as<CallNode>();
+ const CallNode* bias = nullptr;
+
+ if (backend::IsOp(dense, "add")) {
+ bias = dense;
+ dense = dense->args[0].as<CallNode>();
+ }
+ ICHECK(backend::IsOp(dense, "nn.dense"));
+ const auto* dense_op = dense->op.as<OpNode>();
+ ICHECK(dense_op);
+ const std::string name = dense_op->name;
+
+ std::vector<JSONGraphNodeEntry> inputs;
+ inputs.push_back(VisitExpr(cn->args[0])[0]);
+ inputs.push_back(VisitExpr(dense->args[1])[0]);
+ if (bias) {
+ inputs.push_back(VisitExpr(bias->args[1])[0]);
+ }
+ auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs,
1);
+ SetCallNodeAttribute(json_node, dense);
+ return json_node;
+ }
+
+ /*!
+ * \brief Create a JSON representation of a Pad operator.
+ *
+ * \param cn The call to be represented.
+ * \return A JSON representation of a specific operator.
+ */
+ std::shared_ptr<JSONGraphNode> CreatePadJSONNode(const CallNode* cn) {
+ const auto* fn = cn->op.as<FunctionNode>();
+ ICHECK(fn);
+ const auto* pad = fn->body.as<CallNode>();
+ const auto* pad_op = pad->op.as<OpNode>();
+ ICHECK(pad_op);
+ const std::string name = pad_op->name;
+
+ std::vector<JSONGraphNodeEntry> inputs;
+ inputs.push_back(VisitExpr(cn->args[0])[0]);
+
+ auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs,
1);
+
+ const auto* pad_attr = pad->attrs.as<PadAttrs>();
+ ICHECK(pad_attr);
+ auto p = pad_attr->pad_width;
+ // TVM padding format: Dimension wise pair of pre and post padding.
+ // CLML padding format: Dimension wise pre padding followed by dimension
wise post padding.
+ std::vector<std::string> padding =
{std::to_string(p[2][0].as<IntImmNode>()->value),
+
std::to_string(p[2][1].as<IntImmNode>()->value),
+
std::to_string(p[3][0].as<IntImmNode>()->value),
+
std::to_string(p[3][1].as<IntImmNode>()->value)};
+ std::vector<dmlc::any> padding_attr;
+ padding_attr.emplace_back(padding);
+ json_node->SetAttr("pad_width", padding_attr);
+
+ std::vector<std::string> pad_mode = {pad_attr->pad_mode};
+ std::vector<dmlc::any> pad_mode_attr;
+ pad_mode_attr.emplace_back(pad_mode);
+ json_node->SetAttr("pad_mode", pad_mode_attr);
+
+ return json_node;
+ }
+};
+
+/*!
+ * \brief Create a runtime module for CLML.
+ *
+ * This consists of a series of "serialized functions" which each represent a
+ * sub-graph to be computed by CLML and will each be executed independently
from
+ * one another. Each function consists of serialized JSON describing the
sub-graph
+ * and serialized constant tensors.
+ *
+ * \note The CLML runtime module only supports a single operator per
+ * sub-graph currently.
+ *
+ * \param ref The ext_func Relay expression/module to be executed using extern
ops.
+ * \return A runtime module.
+ */
+runtime::Module CLMLCompiler(const ObjectRef& ref) {
+ ICHECK(ref->IsInstance<FunctionNode>()) << "The input ref is expected to be
a Relay function.";
+ Function func = Downcast<Function>(ref);
+ std::string func_name = backend::GetExtSymbol(func);
+
+ CLMLJSONSerializer serializer(func_name, func);
+ serializer.serialize();
+ std::string graph_json = serializer.GetJSON();
+ auto param_names = serializer.GetParams();
+ const auto* pf = runtime::Registry::Get("runtime.clml_runtime_create");
+ ICHECK(pf != nullptr) << "Cannot find CLML runtime module to create";
+ runtime::Module lib = (*pf)(func_name, graph_json, param_names);
+ return lib;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.clml").set_body_typed(CLMLCompiler);
+
+/*!
+ * \brief Check whether CLML graph runtime is used.
+ *
+ * \return True if CLML graph runtime is enabled, False if not.
+ */
+inline constexpr bool IsCLMLRuntimeEnabled() {
+#if TVM_GRAPH_EXECUTOR_CLML
+ return true;
+#else
+ return false;
+#endif
+}
+
+TVM_REGISTER_GLOBAL("relay.op.is_clml_runtime_enabled").set_body_typed(IsCLMLRuntimeEnabled);
+
+Map<String, runtime::NDArray> CLMLConstantUpdater(Expr func, std::string
symbol) {
+ CLMLJSONSerializer serializer(symbol, func);
+ serializer.serialize();
+ auto pmap = serializer.GetParamsMap();
+ return pmap;
+}
+
+TVM_REGISTER_GLOBAL("relay.ext.clml.constant_updater").set_body_typed(CLMLConstantUpdater);
+
+} // namespace contrib
+} // namespace relay
+} // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc
b/src/runtime/contrib/clml/clml_runtime.cc
new file mode 100644
index 0000000000..7966c0e78b
--- /dev/null
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -0,0 +1,1091 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/contrib/clml/clml_runtime.cc
+ * \brief A simple JSON runtime for CLML.
+ */
+
+#include <CL/cl.h>
+#include <CL/opencl.h>
+#ifdef TVM_GRAPH_EXECUTOR_CLML
+#include <CL/cl_qcom_ml_ops.h>
+#endif
+#include <stdlib.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/registry.h>
+
+#include <fstream>
+#include <map>
+#include <utility>
+
+#include "../../opencl/opencl_common.h"
+#include "../json/json_node.h"
+#include "../json/json_runtime.h"
+
+namespace tvm {
+namespace runtime {
+namespace contrib {
+
+using namespace tvm::runtime::json;
+using JSONGraphNode = tvm::runtime::json::JSONGraphNode;
+
+class CLMLRuntime : public JSONRuntimeBase {
+ public:
  /*!
   * \brief Construct the CLML runtime module.
   *
   * Only stores the graph description via the JSONRuntimeBase constructor;
   * the actual CLML objects are created later in Init()/BuildEngine().
   *
   * \param symbol_name The name of the function.
   * \param graph_json serialized JSON representation of a sub-graph.
   * \param const_names The names of each constant in the sub-graph.
   */
  explicit CLMLRuntime(const std::string& symbol_name, const std::string& graph_json,
                       const Array<String>& const_names)
      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
+
  /*!
   * \brief Release all CLML/OpenCL objects owned by this module: the tuning
   * cache (only if this was a tuning run), every cached tensor and its device
   * buffer, every CLML op, the I/O placeholder tensors and the descriptor set.
   */
  ~CLMLRuntime() {
#ifdef TVM_GRAPH_EXECUTOR_CLML
    cl_int result = 0;
    if (this->is_tuning_run) {
      result = h_ClmlIntf->clReleaseMLTuningCacheQCOM(this->tuning_cache);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTuningCacheQCOM:" << result;
    }
    // Release every tensor plus its backing cl_mem buffer.
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.first;
      result = h_ClmlIntf->clReleaseMLTensorQCOM(tensor_desc->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
      result = clReleaseMemObject(tensor_desc->memory);
      ICHECK(result == CL_SUCCESS) << "clReleaseMemObject:" << result;
    }
    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
      result = h_ClmlIntf->clReleaseMLOpQCOM(this->layer_.function[i]);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLOpQCOM:" << result;
    }
    // Placeholders were created separately from the storage_map tensors,
    // so they are released separately. Note their cl_mem is not released here;
    // for placeholders it may alias an externally owned OpenCL buffer (see Run()).
    for (auto it = this->layer_.in_placeholder.begin(); it != this->layer_.in_placeholder.end();
         it++) {
      result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
    }
    for (auto it = this->layer_.out_placeholder.begin(); it != this->layer_.out_placeholder.end();
         it++) {
      result = h_ClmlIntf->clReleaseMLTensorQCOM((*it)->tensor);
      ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorQCOM:" << result;
    }
    result = h_ClmlIntf->clReleaseMLTensorMemoryDescriptorSetQCOM(layer_.descriptorSet);
    ICHECK(result == CL_SUCCESS) << "clReleaseMLTensorMemoryDescriptorSetQCOM:" << result;
#endif
  }
+
  /*!
   * \brief The type key of the module, used by the runtime module registry.
   *
   * \return module type key ("clml").
   */
  const char* type_key() const override { return "clml"; }
+
  /*!
   * \brief Initialize the runtime: bind the compiled constants, set up the
   * CLML/OpenCL context (only when CLML is compiled in) and build the layer.
   *
   * \param consts The constant params from compiled model.
   */
  void Init(const Array<NDArray>& consts) override {
    ICHECK_EQ(consts.size(), const_idx_.size())
        << "The number of input constants must match the number of required.";
    SetupConstants(consts);

#ifdef TVM_GRAPH_EXECUTOR_CLML
    InitCLML();
#endif

    BuildEngine();
  }
+
+#ifdef TVM_GRAPH_EXECUTOR_CLML
+ std::vector<cl_uint> GetVectorValues(const std::vector<std::string>& val) {
+ std::vector<cl_uint> array;
+ for (auto i : val) {
+ array.push_back((cl_uint)stoi(i));
+ }
+ return array;
+ }
+
+ void InitCLML() {
+ // Setup CLML Context
+ cl_int result = 0;
+
+ // Initialize Context and Command Queue
+ result = clGetPlatformIDs(1, &platform, NULL);
+ ICHECK(result == CL_SUCCESS) << "clGetPlatformIDs:" << result;
+
+ uint32_t num_devices = 0;
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
&num_devices);
+ ICHECK(result == CL_SUCCESS && num_devices == 1) << "clGetDeviceIDs:" <<
result;
+
+ result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
+ ICHECK(device_id && result == CL_SUCCESS) << "clGetDeviceIDs:" << result;
+
+ if (!ExtensionStringPresent(device_id)) {
+ LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n";
+ return;
+ }
+
+ // Reuse the OpenCl work space from TVM Device API.
+ auto func = tvm::runtime::Registry::Get("device_api.opencl");
+ ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry";
+ auto device_api = static_cast<cl::OpenCLWorkspace*>(((*func)()).operator
void*());
+ this->context = device_api->context;
+ bool queue_found = false;
+ for (size_t i = 0; i < device_api->devices.size(); ++i) {
+ if (device_api->devices[i] == device_id) {
+ this->queue = device_api->queues[i];
+ this->evts = &(device_api->events[i]);
+ queue_found = true;
+ }
+ }
+ ICHECK(queue_found != false) << "Device queue not found in OpenCL
Workspace";
+
+ // Query and Get CLML Interface
+ static const cl_uint MAX_VERSIONS = 256;
+ cl_int majorVersions[MAX_VERSIONS];
+ cl_int minorVersions[MAX_VERSIONS];
+ cl_uint numVersions = 0;
+ result = clQueryMLInterfaceVersionsQCOM(NULL, NULL, 0, &numVersions);
+ ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" <<
result;
+ ICHECK(numVersions > 0u);
+ ICHECK(numVersions <= MAX_VERSIONS);
+
+ result = clQueryMLInterfaceVersionsQCOM(majorVersions, minorVersions,
numVersions, NULL);
+ ICHECK(result == CL_SUCCESS) << "clQueryMLInterfaceVersionsQCOM:" <<
result;
+
+ for (cl_uint i = 0; i < numVersions; ++i) {
+ if (majorVersions[i] == 2) {
+ LOG(WARNING) << "CLML Version Selected:" << majorVersions[i] << " : "
<< majorVersions[i];
+ h_ClmlIntf = clGetMLInterfaceV2QCOM(0);
+ ICHECK(h_ClmlIntf != NULL) << "clGetMLInterfaceV2QCOM:" << result;
+ break;
+ }
+ }
+ char* tune_flag;
+ if ((tune_flag = getenv("CLML_IS_TUNNING_RUN")))
+ this->is_tuning_run = std::stoi(tune_flag);
+ else
+ this->is_tuning_run = 0;
+
+ if (!(tuning_file = getenv("CLML_TUNNING_CACHE"))) this->is_tuning_run = 0;
+ // A Tuning run, so create the cache from scratch
+ result = h_ClmlIntf->clCreateMLTuningCacheQCOM(&tuning_cache);
+ ICHECK(result == CL_SUCCESS) << "clCreateMLTuningCacheQCOM:" << result;
+ if (!this->is_tuning_run && this->tuning_file) {
+ std::vector<unsigned char> buffer;
+ buffer = readBinFile(this->tuning_file);
+ result = h_ClmlIntf->clLoadMLTuningCacheQCOM(tuning_cache,
buffer.size(), buffer.data());
+ ICHECK(result == CL_SUCCESS) << "clLoadMLTuningCacheQCOM:" << result;
+ }
+ }
+
+ std::vector<unsigned char> readBinFile(const std::string& filename) {
+ std::ifstream fin(filename, std::ios::binary | std::ios::ate);
+ if (!fin.good()) {
+ LOG(FATAL) << "ERROR: Could not load tuning cache file: " + filename;
+ }
+ ICHECK(fin.good());
+ int64_t size = fin.tellg();
+ fin.seekg(0, std::ios::beg);
+ std::vector<unsigned char> buffer(static_cast<size_t>(size));
+ char* ptr = reinterpret_cast<char*>(buffer.data());
+ fin.read(ptr, size);
+ ICHECK(fin.good());
+ return buffer;
+ }
+
  /*!
   * \brief Enqueue a host-to-device write of `data` into a CLML tensor.
   *
   * \param tensor Destination CLML tensor/memory descriptor.
   * \param data Host pointer holding the source bytes.
   * \param layout Source data layout (defaults to NCHW).
   *
   * \note Unlike CopyDataFromCLMLTensor, this does not wait on the returned
   * event — the write completes asynchronously on `queue`.
   */
  void CopyDataToCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
                            cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
    cl_int result = 0;
    cl_event evt = NULL;
    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, data, layout, tensor->tensor,
                                                        tensor->memory,
                                                        0,      // n waitlist
                                                        NULL,   // waitlist
                                                        &evt);  // event
    ICHECK((evt != NULL) && result == CL_SUCCESS) << "clEnqueueWriteMLTensorDataQCOM:" << result;
  }
+
  /*!
   * \brief Enqueue a device-to-host read of a CLML tensor into `data` and
   * block until the read completes.
   *
   * \param tensor Source CLML tensor/memory descriptor.
   * \param data Host pointer receiving the bytes (caller sizes it).
   * \param layout Destination data layout (defaults to NCHW).
   */
  void CopyDataFromCLMLTensor(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> tensor, void* data,
                              cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
    cl_int result = 0;
    cl_event readEvent = NULL;
    // Read the output tensor
    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, tensor->tensor, tensor->memory, data,
                                                       layout,
                                                       0,            // n waitlist
                                                       NULL,         // waitlist
                                                       &readEvent);  // event
    ICHECK(result == CL_SUCCESS) << "clEnqueueReadMLTensorDataQCOM:" << result;

    // Block so the caller can safely use `data` immediately.
    result = clWaitForEvents(1, &readEvent);
    ICHECK(result == CL_SUCCESS) << "clWaitForEvents:" << result;
  }
+
  /*!
   * \brief Run inference: stage inputs into CLML tensors, enqueue every CLML
   * op, optionally report per-layer timings (CLML_PROFILING env var), then
   * copy outputs back to the caller's buffers.
   */
  void Run() override {
    cl_int result = 0;
    // Stage each graph input. Three paths depending on where the data lives:
    // host (CPU) pointer, an existing OpenCL buffer, or any other device
    // (copied through a temporary host buffer).
    for (size_t i = 0; i < input_nodes_.size(); ++i) {
      auto nid = input_nodes_[i];
      uint32_t eid = EntryID(nid, 0);
      if (nodes_[nid].GetOpType() == "input") {
        void* data = data_entry_[eid]->data;
        size_t isize = 1;  // element count (product of dims)
        for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
          isize *= data_entry_[eid]->shape[j];
        }
        if (kDLCPU == data_entry_[eid]->device.device_type) {
          CopyDataToCLMLTensor(layer_.inputs[i], data);
        } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
          // Alias the caller's OpenCL buffer via the input placeholder, then
          // do a device-side tensor copy into the layer's input tensor.
          layer_.in_placeholder[i]->memory = static_cast<cl_mem>(
              ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
          cl_event cpy_evt = NULL;
          result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM(
              queue, layer_.in_placeholder[i]->tensor, layer_.in_placeholder[i]->memory,
              layer_.inputs[i]->tensor, layer_.inputs[i]->memory, 0, NULL, &cpy_evt);
          ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
        } else {
          // Other devices: bounce through a host buffer.
          DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
          cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
          int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;  // fp32 or fp16
          void* tmpptr = reinterpret_cast<void*>(malloc(isize * dtype_size));
          TVMArrayCopyToBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              isize * dtype_size);
          CopyDataToCLMLTensor(layer_.inputs[i], tmpptr);
          free(tmpptr);
        }
      }
    }

    // Enqueue every CLML op in graph order, recording one event per op.
    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
      this->evts->resize(this->evts->size() + 1);
      cl_event* evt = &(this->evts->back());
      result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
                                             this->layer_.descriptorSet, 0, NULL, evt);
      ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
    }

    if (getenv("CLML_PROFILING")) {
      cl_ulong start, end;
      cl_ulong duration = 0;
      // Waiting on the last enqueued op implies all earlier ops finished too.
      clWaitForEvents(1, &(this->evts->back()));
      for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) {
        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
                                &start, nullptr);
        clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end,
                                nullptr);
        duration += (end - start);
        LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start);
      }
      LOG(WARNING) << "Total Duration:" << duration;
    }

    // Copy outputs back, mirroring the three input paths.
    for (size_t i = 0; i < outputs_.size(); ++i) {
      uint32_t eid = EntryID(outputs_[i]);
      void* data = data_entry_[eid]->data;

      size_t osize = 1;
      for (size_t j = 0; j < data_entry_[eid]->ndim; ++j) {
        osize *= data_entry_[eid]->shape[j];
      }
      if (kDLCPU == data_entry_[eid]->device.device_type) {
        // NOTE(review): uses outputs[0] here but outputs[i] in the OpenCL
        // branch below — only safe while a single output is supported; verify.
        CopyDataFromCLMLTensor(layer_.outputs[0], data);
      } else if (kDLOpenCL == data_entry_[eid]->device.device_type) {
        layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
            ((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
        cl_event cpy_evt = NULL;
        result = h_ClmlIntf->clEnqueueCopyMLTensorDataQCOM(
            queue, layer_.outputs[i]->tensor, layer_.outputs[i]->memory,
            layer_.out_placeholder[i]->tensor, layer_.out_placeholder[i]->memory, 0, NULL,
            &cpy_evt);
        ICHECK(result == CL_SUCCESS) << "clEnqueueCopyMLTensorDataQCOM:" << result;
      } else {
        DLDataType tvm_dtype = const_cast<DLTensor*>(data_entry_[eid])->dtype;
        cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
        int dtype_size = cl_dtype == CL_FLOAT ? 4 : 2;

        void* tmpptr = reinterpret_cast<void*>(malloc(osize * dtype_size));
        CopyDataFromCLMLTensor(layer_.outputs[0], tmpptr);
        TVMArrayCopyFromBytes(const_cast<DLTensor*>(data_entry_[eid]), const_cast<void*>(tmpptr),
                              osize * dtype_size);
        free(tmpptr);
      }
    }
  }
+
+ private:
  /*!
   * \brief Build the CLML layer from the JSON graph: create tensors for
   * inputs/kernels, allocate device memory, upload constants, build the
   * tensor-memory descriptor set, and (when tuning) tune each op and dump
   * the tuning cache to file.
   *
   * \note For the time being only one layer or operator is supported
   * per engine.
   */
  void BuildEngine() {
    size_t nid;
    // Pass 1: create a CLML tensor (and op, for kernels) per graph node.
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
      if (node.GetOpType() == "input") {
        auto clml_input = MakeCLMLTensorFromJSONNode(node);
        this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)});
        this->layer_.inputs.push_back(clml_input);
        // Input copy placeholder Tensor (NCHW view used for staging in Run()).
        this->layer_.in_placeholder.push_back(
            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM));
      } else if (node.GetOpType() == "kernel") {
        auto op_name = node.GetOpName();
        if ("nn.conv2d" == op_name) {
          auto out = CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_CONVOLUTION_QCOM);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.depthwise_conv2d" == op_name) {
          auto out = CreateConvolution2DLayer(&layer_, node, CL_CONVOLUTION_MODE_DEPTHWISE_QCOM);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.relu6" == op_name) {
          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU6);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.relu" == op_name) {
          auto out = CreateReLULayer(&layer_, node, CL_ACTIVATION_RELU);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.batch_norm" == op_name) {
          auto out = CreateBatchNormLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
          auto out = CreateGlobalPoolingLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("reshape" == op_name) {
          auto out = CreateReshapeLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.dense" == op_name) {
          auto out = CreateDenseLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.softmax" == op_name) {
          auto out = CreateSoftMaxLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("nn.pad" == op_name) {
          auto out = CreatePadLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else if ("clip" == op_name) {
          auto out = CreateClipLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
        } else {
          LOG(FATAL) << "Unsupported op: " << op_name;
        }
        this->layer_.layer_names.push_back(op_name);
      } else if (node.GetOpType() != "const") {
        LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
      }
    }
    // The tensor of the last visited node is treated as the graph output.
    if (nid > 0) {
      this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first);
      this->layer_.out_placeholder.push_back(
          MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM));
    }
    // Pass 2: allocate device memories and initialize the params if any.
    cl_int result = 0;
    for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) {
      auto tensor_desc = it->second.first;
      JSONGraphNode node = it->second.second;
      void* node_data = nullptr;

      allocateTensorMemory(h_ClmlIntf, context, tensor_desc);

      if (node.GetOpType() == "const") {
        node_data = data_entry_[EntryID(it->first, 0)]->data;
        if (node_data != nullptr) {
          CopyDataToCLMLTensor(tensor_desc, node_data);
        }
      }
      this->layer_.tensorMemDescs.push_back(*tensor_desc);
    }

    // Setup descriptor set covering every tensor in the layer.
    result = h_ClmlIntf->clCreateMLTensorMemoryDescriptorSetQCOM(&this->layer_.descriptorSet);
    ICHECK(result == CL_SUCCESS) << "clCreateMLTensorMemoryDescriptorSetQCOM:" << result;

    result = h_ClmlIntf->clUpdateMLTensorMemoryDescriptorSetQCOM(
        this->layer_.descriptorSet, static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
        this->layer_.tensorMemDescs.data());
    ICHECK(result == CL_SUCCESS) << "clUpdateMLTensorMemoryDescriptorSetQCOM:" << result;

    // Tuning run: tune every op, then serialize the tuning cache to the
    // file given by CLML_TUNNING_CACHE (two-call size-then-data pattern).
    if (this->is_tuning_run) {
      LOG(WARNING) << "CLML Tunning In Progress:";
      for (size_t i = 0; i < this->layer_.function.size(); ++i) {
        LOG(WARNING) << "CLML Tunning:" << i;
        result = h_ClmlIntf->clTuneMLOpQCOM(queue, this->layer_.function[i],
                                            this->layer_.descriptorSet, this->tuning_cache, NULL);
        ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
      }

      size_t cacheLenBytes = 0;
      size_t lenRet = 0;
      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, 0, NULL, &cacheLenBytes);
      ICHECK(result == CL_SUCCESS) << "clSaveMLTuningCacheQCOM:" << result;

      std::vector<unsigned char> savedCache(cacheLenBytes, 0);
      result = h_ClmlIntf->clSaveMLTuningCacheQCOM(tuning_cache, savedCache.size(),
                                                   savedCache.data(), &lenRet);
      assert(result == CL_SUCCESS);

      std::ofstream cache_out(tuning_file, std::ios_base::binary);
      if (cache_out) {
        cache_out.write(reinterpret_cast<char*>(savedCache.data()), savedCache.size());
        cache_out.close();
      }
      LOG(WARNING) << "CLML: Tuning cache dumped to:" << tuning_file;
    }
  }
+
  /*!
   * \brief CLML objects we cache in order to avoid needing to construct
   * a new layer each time.
   */
  struct CachedLayer {
    std::vector<cl_ml_op_qcom> function;  // one CLML op per kernel node, in graph order
    // Graph input tensors and their NCHW staging placeholders (see Run()).
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> inputs;
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> in_placeholder;
    // Graph output tensors and their NCHW staging placeholders.
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> outputs;
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> out_placeholder;
    // Per-op output/input tensors collected while building the engine.
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> func_outs;
    std::vector<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>> func_ins;
    // node id -> (tensor descriptor, JSON node); the single source of truth
    // for tensor ownership (released in the destructor).
    std::map<int, std::pair<std::shared_ptr<cl_ml_tensor_memory_desc_qcom>, JSONGraphNode>>
        storage_map;
    std::vector<cl_ml_tensor_memory_desc_qcom> tensorMemDescs;     // flat list for the descriptor set
    std::vector<cl_ml_tensor_memory_desc_qcom> in_tensorMemDescs;
    std::vector<cl_ml_tensor_memory_desc_qcom> out_tensorMemDescs;
    cl_ml_tensor_mem_desc_set_qcom descriptorSet;  // descriptor set passed to clEnqueueMLOpQCOM
    std::vector<std::string> layer_names;          // op names, for profiling output
    cl_ml_tensor_qcom unusedTensor = NULL;         // dummy tensor for optional op arguments
  };
+
  /*! \brief 4-D tensor dimensions in NCHW order. */
  struct tensor_dims_t {
    uint32_t n, c, h, w;
  };
+
+ bool ExtensionStringPresent(cl_device_id device_id) {
+ cl_int result = 0;
+
+ size_t reqd_size = 0;
+ result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, NULL,
&reqd_size);
+ ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" <<
result;
+
+ std::vector<char> buf(reqd_size);
+ result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size,
buf.data(), NULL);
+ ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result;
+
+ std::string extensions(buf.data());
+ LOG(WARNING) << "OpenCL Extensions:" << extensions;
+ return (extensions.find("cl_qcom_ml_ops") != std::string::npos);
+ }
+
  /*!
   * \brief Create a 4-D CLML tensor object (no backing memory yet).
   *
   * \param pClmlIntf Opaque pointer to the CLML v2 interface table.
   * \param context OpenCL context owning the tensor.
   * \param dims N/C/H/W dimensions.
   * \param layout Tensor layout (defaults to the implementation-optimal one).
   * \param dtype OpenCL channel type (CL_FLOAT or CL_HALF_FLOAT).
   * \return The created CLML tensor handle.
   */
  cl_ml_tensor_qcom DeviceMakeCLMLTensor(
      void* pClmlIntf, cl_context context, tensor_dims_t dims,
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
      cl_channel_type dtype = CL_FLOAT) {
    cl_ml_tensor_qcom tensor;
    cl_int result = CL_OUT_OF_RESOURCES;

    cl_ml_tensor_desc_qcom desc = {
        dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }};
    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
    result = clmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &tensor);
    ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result;
    (void)result;  // silence unused-variable warning when ICHECK compiles out
    return tensor;
  }
+
  /*!
   * \brief Allocate a device buffer sized for the given CLML tensor and
   * attach it to the descriptor.
   *
   * \param pClmlIntf Opaque pointer to the CLML v2 interface table.
   * \param context OpenCL context to allocate from.
   * \param pTensorMemDesc Descriptor whose `memory` field receives the buffer.
   * \return The OpenCL status of the buffer creation (CL_SUCCESS on success).
   */
  cl_int allocateTensorMemory(void* pClmlIntf, cl_context context,
                              std::shared_ptr<cl_ml_tensor_memory_desc_qcom> pTensorMemDesc) {
    uint32_t size = 0;
    cl_int result = CL_OUT_OF_HOST_MEMORY;
    cl_mem buffer = NULL;

    CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast<CLMLInterfaceV2QCOM*>(pClmlIntf);
    // Ask CLML for the required byte size of this tensor's layout.
    result = clmlIntf->clGetMLTensorMemorySizeQCOM(context, pTensorMemDesc->tensor, &size);
    ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result;

    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &result);
    ICHECK(result == CL_SUCCESS) << "clCreateBuffer:" << result;

    pTensorMemDesc->memory = buffer;

    return result;
  }
+
+ tensor_dims_t get_tensor_dims(const JSONGraphNode& node) {
+ std::vector<int64_t> shape = node.GetOpShape()[0];
+ tensor_dims_t dims;
+ dims.n = shape[0];
+ dims.c = shape[1];
+ dims.h = shape[2];
+ dims.w = shape[3];
+ return dims;
+ }
+
+ cl_channel_type MakeCLDataType(const DLDataType& data_type) {
+ if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits == 32) {
+ return CL_FLOAT;
+ } else if (data_type.code == DLDataTypeCode::kDLFloat && data_type.bits ==
16) {
+ return CL_HALF_FLOAT;
+ } else {
+ LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
+ return -1;
+ }
+ }
+
  /*!
   * \brief Create a CLML tensor descriptor from a JSON node's shape and dtype.
   *
   * \param tensor_rep JSON node providing shape and dtype.
   * \param data Unused here — NOTE(review): constants are uploaded separately
   *             in BuildEngine; confirm this parameter is intentionally ignored.
   * \param c_shape Optional shape override (e.g. broadcast shapes for BN params).
   * \param layout Tensor layout.
   * \param dtype OpenCL channel type.
   * \return Shared descriptor holding the created tensor (memory unset).
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
      const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
    std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
    std::vector<size_t> clml_shape(shape.begin(), shape.end());
    if (c_shape.size() > 0) {
      clml_shape = c_shape;
    }
    // Make sure the tensors with dimensions less than 4 are padded with 1.
    // (Only the first four entries are consumed below.)
    clml_shape.push_back(1);
    clml_shape.push_back(1);
    clml_shape.push_back(1);

    tensor_dims_t dims;
    dims.n = clml_shape[0];
    dims.c = clml_shape[1];
    dims.h = clml_shape[2];
    dims.w = clml_shape[3];
    DLDataType tvm_dtype = tensor_rep.GetOpDataType()[0];
    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);

    auto tensor_dsc = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    tensor_dsc->tensor = DeviceMakeCLMLTensor(h_ClmlIntf, context, dims, layout, cl_dtype);
    return tensor_dsc;
  }
+
+ /*!
+ * \brief Create an CLML tensor given the JSON representation. If scale
+ * and offset are given, then create a quantized CLML tensor.
+ *
+ * \param tensor The tensor to represent.
+ * \return CLML Tensor.
+ */
+
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONEntry(
+ const JSONGraphNodeEntry& tensor, std::vector<size_t> shape = {},
+ cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint
dtype = CL_FLOAT) {
+ JSONGraphNode node = nodes_[tensor.id_];
+ if (this->layer_.storage_map.find(tensor.id_) ==
this->layer_.storage_map.end()) {
+ void* node_data = nullptr;
+ if (node.GetOpType() == "const") {
+ node_data = data_entry_[EntryID(tensor)]->data;
+ }
+ auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype,
node_data, shape);
+ this->layer_.storage_map.insert({tensor.id_, std::make_pair(clml_tensor,
node)});
+ return clml_tensor;
+ } else {
+ return this->layer_.storage_map[tensor.id_].first;
+ }
+ }
  /*!
   * \brief Create a CLML tensor given the JSON node representation.
   * Thin wrapper that forwards to MakeCLMLTensor with reordered arguments.
   *
   * \param node The tensor to represent.
   * \param layout Tensor layout.
   * \param dtype OpenCL channel type.
   * \param data (optional) Constant data of input node.
   * \param shape (optional) Shape override.
   * \return CLML Tensor descriptor.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensorFromJSONNode(
      const JSONGraphNode& node, cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
      cl_uint dtype = CL_FLOAT, void* data = nullptr, std::vector<size_t> shape = {}) {
    return MakeCLMLTensor(node, data, shape, layout, dtype);
  }
+ /*!
+ * \brief Create a 2D convolution layer.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConvolution2DLayer(
+ CachedLayer* layer, const JSONGraphNode& node, cl_convolution_mode_qcom
mode) {
+ std::vector<std::string> padding =
node.GetAttr<std::vector<std::string>>("padding");
+ std::vector<std::string> strides =
node.GetAttr<std::vector<std::string>>("strides");
+ std::vector<std::string> dilation =
node.GetAttr<std::vector<std::string>>("dilation");
+ std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+ if (!node.HasAttr("padding")) {
+ clml_padding.resize(4);
+ std::fill(clml_padding.begin(), clml_padding.end(), 0);
+ }
+ cl_uint clml_padding_b[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] =
{clml_padding[0], clml_padding[1]};
+ cl_uint clml_padding_a[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] =
{clml_padding[2], clml_padding[3]};
+ std::vector<cl_uint> v_strides = GetVectorValues(strides);
+ std::vector<cl_uint> v_dilation = GetVectorValues(dilation);
+ cl_uint clml_strides[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] = {v_strides[0],
v_strides[1]};
+ cl_uint clml_dilation[CL_ML_TENSOR_MAX_SPATIAL_DIMS_QCOM] =
{v_dilation[0], v_dilation[1]};
+ cl_int result = 0;
+
+ cl_uint groups =
std::stoi(node.GetAttr<std::vector<std::string>>("groups")[0]);
+ if (CL_CONVOLUTION_MODE_CONVOLUTION_QCOM == mode) {
+ ICHECK(groups == 1) << "CLML convolution only supports group size of 1.";
+ } else {
+ groups = 1; // Don't need to pass groups to depthwise
+ }
+
+ bool has_act = false;
+ std::string activation_type;
+ cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU;
+ if (node.HasAttr("activation_type")) {
+ activation_type =
node.GetAttr<std::vector<std::string>>("activation_type")[0];
+ ICHECK(activation_type == "relu" || activation_type == "relu6")
+ << "Unknown activation type:" << activation_type;
+ if (activation_type == "relu") {
+ clml_act_type = CL_ACTIVATION_RELU;
+ } else {
+ clml_act_type = CL_ACTIVATION_RELU6;
+ }
+ has_act = true;
+ }
+ cl_ml_op_activation_desc_qcom act_desc = {clml_act_type,
CL_PROPAGATE_NAN_QCOM,
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ // Collect inputs and outputs, handling nn.conv2d.
+ std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
+ size_t num_inputs = inputs.size();
+ bool has_bias;
+ bool has_bn;
+ ICHECK(num_inputs >= 2U && num_inputs <= 7U)
+ << "Batchnorm fused convolution requires bax 7 arguments";
+ has_bias = (num_inputs == 3) || (num_inputs == 7);
+ has_bn = (num_inputs == 6) || (num_inputs == 7);
+ // Input
+ auto input = MakeCLMLTensorFromJSONEntry(inputs[0]);
+
+ // Weight
+ auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]);
+
+ // Bias
+ auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ if (has_bias) {
+ bias = MakeCLMLTensorFromJSONEntry(inputs[2]);
+ } else {
+ cl_ml_tensor_desc_qcom desc = {};
+ desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+ result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc,
&layer_.unusedTensor);
+ ICHECK(layer_.unusedTensor && result == CL_SUCCESS) <<
"clCreateMLTensorQCOM:" << result;
+ bias->tensor = layer_.unusedTensor;
+ }
+ // Output
+ auto output = MakeCLMLTensorFromJSONNode(node);
+ cl_ml_op_convolution_desc_qcom conv_desc{mode,
+ groups,
+ 4,
+ {clml_padding_b[0],
clml_padding_b[1]},
+ {clml_padding_a[0],
clml_padding_a[1]},
+ {clml_strides[0],
clml_strides[1]},
+ {clml_dilation[0],
clml_dilation[1]},
+ 0,
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ cl_ml_op_qcom op = NULL;
+ if (!has_bn) {
+ if (!has_act) {
+ result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
+ context, 0, &conv_desc, input->tensor, weight->tensor,
bias->tensor, output->tensor,
+ &op, NULL);
+ ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+ } else {
+ result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM(
+ context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor,
bias->tensor, NULL,
+ output->tensor, &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+ }
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ } else {
+ int bn_index = has_bias ? 3 : 2;
+ int axis =
std::stoi(node.GetAttr<std::vector<std::string>>("batchnorm")[0]);
+ auto bn_dims = get_tensor_dims(nodes_[inputs[bn_index].id_]);
+ std::vector<size_t> bn_shape = {1, 1, 1, 1};
+ bn_shape[axis] = bn_dims.n;
+ auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape);
+ bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape);
+ bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape);
+ bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape);
+
+ cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+ if (!has_act) {
+ result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
+ context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor,
bias->tensor,
+ output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor,
bn_bias->tensor, &op,
+ tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+ } else {
+ result =
h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM(
+ context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor,
weight->tensor,
+ bias->tensor, output->tensor, NULL, bn_mean->tensor,
bn_var->tensor, bn_scale->tensor,
+ bn_bias->tensor, &op, tuning_cache);
+
+ ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result;
+ }
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ }
+ return output;
+ }
+
  /*!
   * \brief Create a ReLU(X) layer.
   *
   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
   * \param node The JSON representation of the operator.
   * \param clml_act_type The activation function (relu or relu6).
   * \return The op's output tensor descriptor.
   */
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateReLULayer(
      CachedLayer* layer, const JSONGraphNode& node,
      cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
    auto output = MakeCLMLTensorFromJSONNode(node);

    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
                                              CL_ARITHMETIC_MODE_FP32_QCOM};

    // The API requires a placeholder tensor for the unused operand slot.
    cl_ml_tensor_desc_qcom desc = {};
    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
    result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor);
    ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;

    result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(context, 0, &act_desc, input->tensor,
                                                           layer_.unusedTensor, output->tensor, &op,
                                                           tuning_cache);
    ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result;

    layer_.func_ins.push_back(input);
    layer->function.push_back(op);
    return output;
  }
+
+ /*!
+ * \brief Create a batch norm layer.
+ *
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom>
CreateBatchNormLayer(CachedLayer* layer,
+ const
JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+ auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
+ std::vector<size_t> bn_shape = {1, 1, 1, 1};
+ bn_shape[axis] = bn_dims.n;
+ auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape);
+ bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape);
+ bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape);
+ bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape);
+
+ auto output = MakeCLMLTensorFromJSONNode(node);
+
+ cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
+ context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor,
bn_scale->tensor,
+ bn_bias->tensor, output->tensor, &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result;
+
+ layer->function.push_back(op);
+ layer_.func_ins.push_back(input);
+ return output;
+ }
+
+ /*!
+ * \brief Create a global pooling layer.
+ *
+ * \note Currently global_max_pool2d and global_avg_pool2d are supported.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateGlobalPoolingLayer(
+ CachedLayer* layer, const JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ auto output = MakeCLMLTensorFromJSONNode(node);
+ auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+ cl_ml_op_pooling_desc_qcom pool_desc = {
+ node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
+ :
CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+ 4, // reserved
+ {0, 0},
+ {0, 0},
+ {1, 1},
+ {in_dims.w, in_dims.h},
+ CL_PROPAGATE_NAN_QCOM,
+ CL_ARITHMETIC_MODE_FP32_QCOM,
+ };
+
+ cl_ml_tensor_desc_qcom desc = {};
+ desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+ result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc,
&layer_.unusedTensor);
+ ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+ result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(context, 0,
&pool_desc, input->tensor,
+ layer_.unusedTensor,
output->tensor, &op,
+ tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ return output;
+ }
+
+ /*!
+ * \brief Create a SoftMax layer.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML output.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom>
CreateSoftMaxLayer(CachedLayer* layer,
+ const
JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+ auto output = MakeCLMLTensorFromJSONNode(node,
CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr,
+ {out_dims.n, out_dims.c, 1, 1});
+
+ cl_ml_op_softmax_desc_qcom softmax_desc =
{CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
+ CL_SOFTMAX_MODE_INSTANCE_QCOM,
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(context, 0, &softmax_desc,
input->tensor,
+ output->tensor, &op,
tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
+
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ return output;
+ }
+
/*!
 * \brief Create a Pad layer.
 *
 * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output.
 * \param node The JSON representation of the operator.
 */
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePadLayer(CachedLayer* layer,
                                                              const JSONGraphNode& node) {
  cl_int result = 0;
  cl_ml_op_qcom op = NULL;
  auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
  auto output = MakeCLMLTensorFromJSONNode(node);

  std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
  std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
  std::vector<cl_uint> clml_padding = GetVectorValues(padding);

  // Map relay pad_mode strings to CLML pad modes.
  // NOTE(review): relay "edge" (replicate border) is mapped to CLML
  // CL_PAD_MODE_SYMMETRIC_QCOM — confirm the two semantics actually match.
  cl_pad_mode_qcom clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
  if (pad_mode == "constant")
    clml_pad_mode = CL_PAD_MODE_CONSTANT_QCOM;
  else if (pad_mode == "edge")
    clml_pad_mode = CL_PAD_MODE_SYMMETRIC_QCOM;
  else if (pad_mode == "reflect")
    clml_pad_mode = CL_PAD_MODE_REFLECT_QCOM;
  else
    LOG(FATAL) << "Padding mode not supported by CLML:" << pad_mode;

  // NOTE(review): only the first four pad_width values are consumed here; a 4-D
  // NCHW pad attribute carries eight (before/after per dim). Assumes the
  // pattern table only offloads pads expressible in four values — TODO confirm.
  cl_ml_op_pad_desc_qcom pad_desc{
      clml_pad_mode,
      {0, 0},
      {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
      CL_ARITHMETIC_MODE_FP32_QCOM};

  result = h_ClmlIntf->clCreateMLOpPadQCOM(context, 0, &pad_desc, input->tensor, output->tensor,
                                           &op, tuning_cache);
  ICHECK(op && result == CL_SUCCESS) << "Pad Error:" << result;

  layer_.func_ins.push_back(input);
  layer->function.push_back(op);
  return output;
}
+
+ /*!
+ * \brief Create a Reshape layer.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML output.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom>
CreateReshapeLayer(CachedLayer* layer,
+ const
JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ auto output = MakeCLMLTensorFromJSONNode(node);
+
+ result = h_ClmlIntf->clCreateMLOpReshapeQCOM(context, 0, input->tensor,
output->tensor, &op,
+ tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result;
+
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ return output;
+ }
+
+ /*!
+ * \brief Create a dense layer.
+ *
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML function.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateDenseLayer(CachedLayer*
layer,
+ const
JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
+ bool has_bias = node.GetInputs().size() == 3 ? true : false;
+
+ auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1,
wt_dims.n, wt_dims.c});
+ auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
+ if (has_bias) {
+ auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
+ bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c,
1, 1});
+ }
+
+ cl_ml_op_fully_connected_desc_qcom fc_desc = {1,
CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
+
CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ auto output = MakeCLMLTensorFromJSONNode(node);
+ if (has_bias) {
+ result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0,
&fc_desc, input->tensor,
+ weight->tensor,
bias->tensor,
+ output->tensor, &op,
tuning_cache);
+ } else {
+ result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0,
&fc_desc, input->tensor,
+ weight->tensor,
NULL, output->tensor, &op,
+ tuning_cache);
+ }
+ ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result;
+
+ layer->function.push_back(op);
+ layer_.func_ins.push_back(input);
+ return output;
+ }
+
+ /*!
+ * \brief Create a Clip(X) layer.
+ *
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML output.
+ * \param node The JSON representation of the operator.
+ */
+ std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateClipLayer(CachedLayer*
layer,
+ const
JSONGraphNode& node) {
+ cl_int result = 0;
+ cl_ml_op_qcom op = NULL;
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+ auto output = MakeCLMLTensorFromJSONNode(node);
+ cl_float a_max =
std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
+ cl_float a_min =
std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);
+
+ cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
+ {{a_max}, CL_FLOAT},
+ {{a_min}, CL_FLOAT},
+ CL_ARITHMETIC_MODE_FP32_QCOM};
+
+ result = h_ClmlIntf->clCreateMLOpClipQCOM(context, 0, &clip_desc,
input->tensor, output->tensor,
+ &op, tuning_cache);
+ ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result;
+
+ layer_.func_ins.push_back(input);
+ layer->function.push_back(op);
+ return output;
+ }
+
/*!
 * \brief The network layers represented by CLML functions.
 * \note Currently only supports a single layer.
 */

CachedLayer layer_;
// CLML Context
// Function table for the cl_qcom_ml_ops (CLML) extension entry points.
CLMLInterfaceV2QCOM* h_ClmlIntf = NULL;
// OpenCL objects shared by all CLML operations in this module.
cl_platform_id platform = NULL;
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_command_queue queue = NULL;
// Completion events — presumably one per enqueued op; verify against Run().
std::vector<cl_event>* evts;
// Tuning cache handle; related to is_tuning_run / tuning_file below.
cl_ml_tuningcache_qcom tuning_cache = NULL;
bool is_tuning_run;
char* tuning_file;
#else
// Stub used when the CLML runtime is not compiled in (USE_CLML_GRAPH_EXECUTOR off).
void Run() override {
  LOG(FATAL) << "Cannot call run on CLML module without runtime enabled. "
             << "Please build with USE_CLML_GRAPH_EXECUTOR.";
}

// Stub engine builder for builds without the CLML runtime.
void BuildEngine() {
  LOG(WARNING) << "CLML engine is not initialized. "
               << "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
#endif
+};
+
/*!
 * \brief Create a CLML runtime module from a serialized JSON graph.
 *
 * \param symbol_name Name of the offloaded subgraph symbol.
 * \param graph_json The JSON serialized graph to execute.
 * \param const_names Names of the constants bound to the graph.
 * \return A runtime::Module wrapping a CLMLRuntime instance.
 */
runtime::Module CLMLRuntimeCreate(const String& symbol_name, const String& graph_json,
                                  const Array<String>& const_names) {
  auto n = make_object<CLMLRuntime>(symbol_name, graph_json, const_names);
  return runtime::Module(n);
}
+
+TVM_REGISTER_GLOBAL("runtime.clml_runtime_create").set_body_typed(CLMLRuntimeCreate);
+TVM_REGISTER_GLOBAL("runtime.module.loadbinary_clml")
+ .set_body_typed(JSONRuntimeBase::LoadFromBinary<CLMLRuntime>);
+} // namespace contrib
+} // namespace runtime
+} // namespace tvm
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index f98e08ce94..be0cd9eb8f 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -310,6 +310,8 @@ TVM_DLL Map<String, String> GetLibInfo() {
{"USE_THRUST", TVM_INFO_USE_THRUST},
{"USE_VITIS_AI", TVM_INFO_USE_VITIS_AI},
{"USE_VULKAN", TVM_INFO_USE_VULKAN},
+ {"USE_CLML", TVM_INFO_USE_CLML},
+ {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR},
};
return result;
}
diff --git a/python/tvm/relay/op/contrib/__init__.py
b/tests/python/contrib/test_clml/__init__.py
similarity index 71%
copy from python/tvm/relay/op/contrib/__init__.py
copy to tests/python/contrib/test_clml/__init__.py
index a03d0f6d4f..dfeb9ae5c8 100644
--- a/python/tvm/relay/op/contrib/__init__.py
+++ b/tests/python/contrib/test_clml/__init__.py
@@ -14,15 +14,4 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-# pylint: disable=wildcard-import
-"""Contrib modules."""
-from .register import get_pattern_table, register_pattern_table
-
-from .arm_compute_lib import *
-from .dnnl import *
-from .bnns import *
-from .coreml import *
-from .ethosn import *
-from .libtorch import *
-from .tensorrt import *
-from .cutlass import *
+"""Infrastructure and tests for CLML"""
diff --git a/tests/python/contrib/test_clml/infrastructure.py
b/tests/python/contrib/test_clml/infrastructure.py
new file mode 100644
index 0000000000..19901d733e
--- /dev/null
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -0,0 +1,256 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from itertools import zip_longest, combinations
+import json
+import os
+import warnings
+
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm import rpc
+
+# from tvm.contrib.debugger import debug_runtime as graph_executor
+from tvm.contrib import graph_executor
+from tvm.relay.op.contrib import clml
+from tvm.contrib import utils
+from tvm.autotvm.measure import request_remote
+from tvm.relay.expr_functor import ExprMutator, Call
+
+
class Device:
    """
    Configuration for CLML tests.

    Check tests/python/contrib/test_clml/ for the presence of a test_config.json
    file. This file can be used to override the default configuration here, which
    will otherwise attempt to run the CLML runtime tests locally if the runtime is
    available. Changing the configuration allows these runtime tests to be
    offloaded to a remote Adreno device via a tracker for example.

    Notes
    -----
        The test configuration will be loaded once when the class is created. If the
        configuration changes between tests, any changes will not be picked up.

    Parameters
    ----------
    device : RPCSession
        Allows tests to connect to and use remote device.

    Attributes
    ----------
    connection_type : str
        Details the type of RPC connection to use. Options:
        local - Use the local device,
        tracker - Connect to a tracker to request a remote device,
        remote - Connect to a remote device directly.
    host : str
        Specify IP address or hostname of remote target.
    port : int
        Specify port number of remote target.
    target : str
        The compilation target.
    target_host : str
        The host-side compilation target used alongside the OpenCL target.
    device_key : str
        The device key of the remote target. Use when connecting to a remote device via a tracker.
    cross_compile : str
        Specify path to cross compiler to use when connecting a remote device from a non-arm platform.
    """

    # Class-level defaults; Device.load("test_config.json") overrides them when the file exists.
    connection_type = "tracker"
    host = "localhost"
    port = 9090
    target = "opencl"
    target_host = "llvm -mtriple=aarch64-linux-gnu"
    device_key = ""
    cross_compile = ""

    def __init__(self):
        """Keep remote device for lifetime of object."""
        self.device = self._get_remote()

    @classmethod
    def _get_remote(cls):
        """Get a remote (or local) device to use for testing."""
        if cls.connection_type == "tracker":
            device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000)
        elif cls.connection_type == "remote":
            device = rpc.connect(cls.host, cls.port)
        elif cls.connection_type == "local":
            device = rpc.LocalSession()
        else:
            raise ValueError(
                "connection_type in test_config.json should be one of: " "local, tracker, remote."
            )

        return device

    @classmethod
    def load(cls, file_name):
        """Load test config

        Load the test configuration by looking for file_name relative
        to the test_clml directory. A missing file falls back to the class
        defaults with a warning instead of failing.
        """
        location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
        config_file = os.path.join(location, file_name)
        if not os.path.exists(config_file):
            warnings.warn("Config file doesn't exist, resuming CLML tests with default config.")
            return
        with open(config_file, mode="r") as config:
            test_config = json.load(config)

        # Required keys raise KeyError when absent; optional keys default to "".
        cls.connection_type = test_config["connection_type"]
        cls.host = test_config["host"]
        cls.port = test_config["port"]
        cls.target = test_config["target"]
        cls.target_host = test_config["target_host"]
        cls.device_key = test_config.get("device_key") or ""
        cls.cross_compile = test_config.get("cross_compile") or ""
+
+
def skip_runtime_test():
    """Skip test if it requires the CLML runtime and it's not present.

    Returns
    -------
    bool
        True when the test should be skipped, False otherwise.
    """
    # CLML codegen not present.
    if not tvm.get_global_func("relay.ext.clml", True):
        print("Skip because CLML codegen is not available.")
        return True

    # Running locally requires the CLML runtime to be compiled in.
    # Note: Ensure that the device config has been loaded before this check.
    # (The original condition `not Device.connection_type != "local"` was a
    # confusing double negative for `Device.connection_type == "local"`.)
    if Device.connection_type == "local" and not clml.is_clml_runtime_enabled():
        print("Skip because runtime isn't present or a remote device isn't being used.")
        return True

    return False
+
+
def skip_codegen_test():
    """Skip test if it requires the CLML codegen and it's not present.

    Returns
    -------
    bool
        True when the test should be skipped, False otherwise (the original
        implicitly returned None in the "don't skip" case).
    """
    if not tvm.get_global_func("relay.ext.clml", True):
        print("Skip because CLML codegen is not available.")
        return True
    return False
+
+
def build_module(mod, target, target_host, params=None, enable_clml=True):
    """Build a relay module, optionally partitioning it for CLML.

    Parameters
    ----------
    mod : tvm.IRModule or tvm.relay.expr.Call
        The module (or bare relay expression) to compile.
    target : str
        Device compilation target.
    target_host : str
        Host-side compilation target.
    params : dict, optional
        Parameters to bind into the module.
    enable_clml : bool
        When True, offload supported subgraphs to CLML before building.
    """
    if isinstance(mod, tvm.relay.expr.Call):
        mod = tvm.IRModule.from_expr(mod)

    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
        if enable_clml:
            mod = clml.partition_for_clml(mod, params)
        # Clear the TE compiler cache so repeated builds in one process don't
        # reuse stale entries.
        relay.backend.te_compiler.get().clear()
        return relay.build(mod, target=target, target_host=target_host, params=params)
+
+
def build_and_run(
    mod,
    inputs,
    outputs,
    params,
    device,
    enable_clml=True,
    no_runs=1,
    config=None,
):
    """Build and run the relay module.

    Parameters
    ----------
    mod : tvm.IRModule
        Module to compile and execute.
    inputs : dict
        Named input tensors.
    outputs : int
        Number of outputs to fetch per run.
    params : dict
        Module parameters.
    device : Device
        Test device wrapper providing the RPC session and targets.
    enable_clml : bool
        Whether to offload supported subgraphs to CLML.
    no_runs : int
        Number of inference runs to collect.
    config : dict, optional
        Extra context echoed in the error message when the build fails.

    Returns
    -------
    list
        One list of output tensors per run.
    """
    if config is None:
        config = {}

    try:
        libm = build_module(mod, device.target, device.target_host, params, enable_clml)
    except Exception as e:
        # (The original also extracted and json-dumped the CLML codegen here,
        # but discarded the result; that dead work is done properly in
        # verify_codegen instead.)
        err_msg = "The module could not be built.\n"
        if config:
            err_msg += f"The test failed with the following parameters: {config}\n"
        err_msg += str(e)
        # Chain the original exception so the root cause is preserved.
        raise Exception(err_msg) from e

    lib = update_lib(libm, device.device, device.cross_compile)
    gen_module = graph_executor.GraphModule(lib["default"](device.device.cl(0)))
    gen_module.set_input(**inputs)
    out = []
    for _ in range(no_runs):
        gen_module.run()
        out.append([gen_module.get_output(i) for i in range(outputs)])
    time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50)
    cost = time_f().mean
    print("%g secs/iteration\n" % cost)
    return out
+
+
def update_lib(lib, device, cross_compile):
    """Export ``lib`` to a temporary shared object and load it on ``device``.

    ``cross_compile``, when non-empty, is passed as the compiler used for
    exporting the library.
    """
    so_name = "mod.so"
    temp_dir = utils.tempdir()
    so_path = temp_dir.relpath(so_name)
    export_kwargs = {"cc": cross_compile} if cross_compile else {}
    lib.export_library(so_path, **export_kwargs)
    device.upload(so_path)
    return device.load_module(so_name)
+
+
def extract_clml_modules(module):
    """Get the CLML module(s) from llvm module."""
    imported = module.get_lib().imported_modules
    return [mod for mod in imported if mod.type_key == "clml"]
+
+
def verify_codegen(
    module,
    known_good_codegen,
    num_clml_modules=1,
    tvm_ops=0,
    target="llvm -mtriple=aarch64-linux-gnu",
):
    """Check CLML codegen against a known good output.

    The parameter list is kept backward compatible; ``tvm_ops`` is currently
    unused but retained so existing callers do not break.
    """
    # build_module's signature is (mod, target, target_host, params, enable_clml);
    # the previous call passed unsupported tvm_ops/clml_partitions keywords and
    # would raise a TypeError before any check ran. The llvm target also serves
    # as the host target for codegen-only verification.
    module = build_module(module, target, target)
    clml_modules = extract_clml_modules(module)

    assert len(clml_modules) == num_clml_modules, (
        f"The number of CLML modules produced ({len(clml_modules)}) does not "
        f"match the expected value ({num_clml_modules})."
    )

    for mod in clml_modules:
        source = mod.get_source("json")
        codegen = json.loads(source)["nodes"]
        # remove input and const names as these cannot be predetermined
        for node in range(len(codegen)):
            if codegen[node]["op"] == "input" or codegen[node]["op"] == "const":
                codegen[node]["name"] = ""
        codegen_str = json.dumps(codegen, sort_keys=True, indent=2)
        known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2)

        assert codegen_str == known_good_codegen_str, (
            f"The JSON produced by codegen does not match the expected result. \n"
            f"Actual={codegen_str} \n"
            f"Expected={known_good_codegen_str}"
        )
diff --git a/tests/python/contrib/test_clml/test_network.py
b/tests/python/contrib/test_clml/test_network.py
new file mode 100644
index 0000000000..d89676f10e
--- /dev/null
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -0,0 +1,139 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""OpenCL ML network tests."""
+
+import numpy as np
+import pytest
+from tvm import testing
+from tvm import relay
+
+import tvm
+from test_clml.infrastructure import skip_runtime_test, build_and_run
+from test_clml.infrastructure import Device
+
+
def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
    """Helper function to build and run a network.

    Returns a two-element list: the CLML-offloaded outputs first, then the
    plain OpenCL outputs. ``inputs``, ``atol`` and ``rtol`` are currently
    unused but kept for interface compatibility with callers.
    """
    results = []
    for use_clml in (True, False):
        run_out = build_and_run(mod, data, 1, params, device, enable_clml=use_clml)[0]
        results.append(run_out)
    return results
+
+
def _get_keras_model(keras_model, inputs_dict, data):
    """Convert a Keras graph to relay and compute a reference output.

    Parameters
    ----------
    keras_model : tf.keras.Model
        The model to convert.
    inputs_dict : dict
        Maps input name -> (shape, dtype).
    data : dict
        Concrete NCHW input arrays keyed by input name.
    """
    inputs = {}
    for name, (shape, _) in inputs_dict.items():
        # Map each declared input to its shape. The previous code always wrote
        # keras_model.input_names[0], silently collapsing multi-input models
        # onto a single entry.
        inputs[name] = shape

    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    def get_bottom_top_model(model, layer_name):
        # Rebuild the model up to (and including) layer_name.
        layer = model.get_layer(layer_name)
        bottom_input = model.layers[0].input
        bottom_output = bottom_input
        for layer in model.layers:
            bottom_output = layer(bottom_output)
            if layer.name == layer_name:
                break
        bottom_model = Model(bottom_input, bottom_output)
        return bottom_model

    keras_model = get_bottom_top_model(keras_model, "predictions")
    # Keras expects NHWC; the test data is NCHW, hence the transpose.
    ref_output = keras_model.predict(data["input_1"].transpose(0, 2, 3, 1))

    mod, params = relay.frontend.from_keras(keras_model, inputs, layout="NCHW")
    return mod, params, ref_output
+
+
def test_mobilenet():
    """Compare MobileNet predictions between the CLML and plain OpenCL paths."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    dtype = "float16"

    def get_model():
        from tensorflow.keras.applications import MobileNet

        mobilenet = MobileNet(
            include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
        )
        mobilenet.load_weights("mobilenet_1_0_224_tf.h5")
        inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")}

        data = {}
        np.random.seed(0)

        for name, (shape, dtype) in inputs.items():
            if dtype == "uint8":
                low, high = 0, 1
            else:
                low, high = -1, 1
            data[name] = np.random.uniform(low, high, shape).astype(dtype)

        mod, params, ref_outputs = _get_keras_model(mobilenet, inputs, data)
        return mod, params, inputs, data, ref_outputs

    mod, params, inputs, input_data, ref_outputs = get_model()
    outputs = _build_and_run_network(
        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
    )

    # outputs[0] was produced with CLML enabled, outputs[1] with plain OpenCL
    # (the original print labels were swapped).
    print("CLML:", outputs[0][0].asnumpy().shape)
    print("OpenCL:", outputs[1][0].asnumpy().shape)

    # Compare the first ten argsort indices between the two backends.
    # NOTE(review): np.argsort is ascending, so [:10] are the *lowest* logits;
    # confirm whether top-k (i.e. [-10:]) was intended.
    opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten()
    clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten()

    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
+
+
+"""
+ tvm.testing.assert_allclose(
+ ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
+ print("OpenCL to Keras looks good")
+ tvm.testing.assert_allclose(
+ outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5,
atol=1e-5)
+ print("OpenCL to CLML looks good")
+ exit(0)
+
+ tvm.testing.assert_allclose(
+ ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(),
rtol=1e-5, atol=1e-5)
+ print("OpenCL to Keras looks good")
+ tvm.testing.assert_allclose(
+ outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5,
atol=1e-5)
+ print("OpenCL to CLML looks good")
+"""
+
+
+if __name__ == "__main__":
+ test_mobilenet()
diff --git a/tests/python/contrib/test_clml/test_ops.py
b/tests/python/contrib/test_clml/test_ops.py
new file mode 100644
index 0000000000..63f5bc168f
--- /dev/null
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -0,0 +1,216 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLML integration conv2d tests."""
+
+import numpy as np
+
+np.random.seed(0)
+
+import tvm
+from tvm import testing
+from tvm import relay
+from tvm.ir import IRModule
+
+from test_clml.infrastructure import (
+ skip_runtime_test,
+ skip_codegen_test,
+ build_and_run,
+ Device,
+)
+
+
def _get_conv_model(
    shape,
    kernel_h,
    kernel_w,
    padding,
    strides,
    dilation,
    groups,
    dtype,
    channels,
    var,
    has_bias=False,
    has_activation=False,
    has_pad=False,
):
    """Return a conv2d model (optionally with pad/bias/relu) and any parameters it may have."""
    a = relay.var(next(iter(var)), shape=shape, dtype=dtype)
    if has_pad:
        p = ((0, 0), (padding[0], padding[0]), (padding[1], padding[1]), (0, 0))
        a = relay.nn.pad(a, pad_width=p)
        padding = (0, 0, 0, 0)
    else:
        if len(padding) == 2:
            padding = (padding[0], padding[1], padding[0], padding[1])
        shape = (shape[0], shape[1], shape[2] + padding[0] * 2, shape[3] + padding[1] * 2)
    is_depthwise = shape[1] == channels == groups

    # Depthwise convolutions use the IOHW kernel layout. The previous ternary
    # produced "OIHW" on both branches, leaving the IOHW path unreachable.
    weight_format = "IOHW" if is_depthwise else "OIHW"
    if weight_format == "IOHW":
        weight_shape = (shape[1] // groups, channels, kernel_h, kernel_w)
    else:
        weight_shape = (channels, shape[1] // groups, kernel_h, kernel_w)

    w = tvm.nd.array(np.random.uniform(-1, 1, weight_shape).astype(dtype))
    weights = relay.const(w, dtype)
    out = relay.nn.conv2d(
        a,
        weights,
        kernel_size=(kernel_h, kernel_w),
        data_layout="NCHW",
        kernel_layout=weight_format,
        dilation=dilation,
        strides=strides,
        padding=padding,
        groups=groups,
        channels=channels,
        out_dtype=dtype,
    )
    params = {"w": w}
    if has_bias:
        # NOTE(review): all current callers use groups == 1; if depthwise trials
        # are ever enabled, confirm weight_shape[2] is the right bias size for
        # the IOHW layout.
        bias_shape = weight_shape[2] if is_depthwise else weight_shape[0]
        b = tvm.nd.array(np.random.uniform(-1, 1, bias_shape).astype(dtype))
        biasc = relay.const(b, dtype)
        out = relay.nn.bias_add(out, biasc, axis=1)
        params["b"] = b

    if has_activation:
        out = relay.nn.relu(out)

    return out, params
+
+
def test_conv2d():
    """Compare CLML conv2d (with optional pad/bias/relu composites) against plain OpenCL."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    np.random.seed(0)

    dtype = "float32"

    # Each trial: kernel_h, kernel_w, pad, stride, dilation, out_channels,
    # input shape (C, H, W), and (has_pad, has_bias, has_activation).
    trials = [
        # Normal convolution
        [3, 3, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [2, 1, (2, 2), (1, 1), (1, 1), 7, (15, 16, 12), (False, False, True)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, True)],
        # Normal convolution
        [2, 2, (1, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [2, 1, (2, 2), (1, 1), (1, 1), 7, (16, 12, 15), (False, False, True)],
        [3, 3, (2, 1), (1, 1), (1, 1), 4, (14, 10, 10), (False, True, False)],
        [3, 3, (1, 1), (1, 1), (1, 1), 16, (16, 12, 15), (False, False, False)],
        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True)],
        [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False)],
        [5, 5, (1, 1), (2, 2), (1, 1), 4, (14, 10, 10), (False, False, False)],
        [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False)],
        [3, 3, (1, 1), (2, 2), (1, 1), 16, (14, 10, 10), (False, True, True)],
    ]

    for (
        kernel_h,
        kernel_w,
        pad,
        stride,
        dilation,
        out_channels,
        shape,
        composite,
    ) in trials:
        shape = (1, *shape)
        groups = 1
        inputs = {
            "a": tvm.nd.array(np.random.uniform(-1, 1, shape).astype(dtype)),
        }

        func, params = _get_conv_model(
            shape,
            kernel_h,
            kernel_w,
            pad,
            stride,
            dilation,
            groups,
            dtype,
            out_channels,
            inputs,
            has_pad=composite[0],
            has_bias=composite[1],
            has_activation=composite[2],
        )
        opencl_out = build_and_run(func, inputs, 1, params, device, enable_clml=False)[0]
        clml_out = build_and_run(func, inputs, 1, params, device, enable_clml=True)[0]

        tvm.testing.assert_allclose(
            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, atol=1e-5
        )
+
+
def test_batchnorm():
    """Compare CLML batch_norm against the plain OpenCL path on random data."""
    Device.load("test_config.json")

    if skip_runtime_test():
        return

    device = Device()
    np.random.seed(0)

    dtype = "float32"
    in_shape = (1, 8, 64, 64)
    channels = 8

    input_arr = tvm.nd.array(np.random.uniform(-1, 1, in_shape).astype(dtype))
    data = relay.var("a", shape=in_shape, dtype=dtype)

    gamma_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
    beta_arr = tvm.nd.array(np.random.uniform(-1, 1, (channels)).astype(dtype))
    gamma = relay.const(gamma_arr, dtype)
    beta = relay.const(beta_arr, dtype)

    # Use the true per-channel statistics of the input as the moving stats.
    mean_arr = tvm.nd.array(np.mean(input_arr.asnumpy(), axis=(0, 2, 3), keepdims=False))
    mean = relay.const(mean_arr)
    variance_arr = tvm.nd.array(np.var(input_arr.asnumpy(), axis=(0, 2, 3), keepdims=False))
    variance = relay.const(variance_arr)

    func = relay.nn.batch_norm(data, gamma, beta, mean, variance, axis=1, epsilon=0.0001)[0]
    mod = IRModule.from_expr(func)

    inputs = {"a": input_arr}
    params = {}

    backend_outs = [
        build_and_run(mod, inputs, 1, params, device, enable_clml=flag)[0]
        for flag in (False, True)
    ]
    opencl_out, clml_out = backend_outs

    tvm.testing.assert_allclose(
        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-5, atol=1e-5
    )
+
+
if __name__ == "__main__":
    # Run both op-level tests; test_conv2d had been left commented out.
    test_conv2d()
    test_batchnorm()