jroesch commented on a change in pull request #8795: URL: https://github.com/apache/tvm/pull/8795#discussion_r694460440
########## File path: python/tvm/relay/backend/contrib/ethosu/__init__.py ########## @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm(R) Ethos(TM)-U NPU codegen modules for relay.""" Review comment: Nit: capitalization ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, Review comment: ```suggestion """This rewriting converts split operations into a sequence of strided_slice operations, ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/preprocess.py ########## @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +"""Set of passes to pre-process the IRModule prior to codegen""" +from . 
import _ffi_api + + +def preprocess_ext_io(): Review comment: API document string, what does this return? ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. 
+ """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. + + Parameters + ---------- + split : relay.Expr + The relay expression for split operator + + Returns + ------- + section_begins : list + A list containing integers corresponding to section + begins + """ + indices_or_sections = split.attrs.indices_or_sections + input_shape = split.args[0].checked_type.shape + split_axis = split.attrs.axis + + if isinstance(indices_or_sections, tvm.ir.container.Array): + # 0 is the beginning of the first section. + return [0] + list(indices_or_sections) + split_axis_len = input_shape[split_axis].value + section_length = split_axis_len // indices_or_sections.value + section_begins = list(range(0, split_axis_len, section_length)) + return section_begins + + def callback(self, pre, post, node_map): + splits_types = dict() + split_input = post.args[0] + for idx, field_type in enumerate(post.checked_type.fields): + split = relay.TupleGetItem(post, idx) + splits_types[split] = field_type + + split_begins = list() + split_ends = list() + section_begins_in_split_axis = self.get_section_begin_coords(post) + for split_cord in section_begins_in_split_axis: + # first begin is [0, 0, ... 
, 0] + begin_shape = [0 for i in range(len(split_input.checked_type.shape))] + begin_shape[post.attrs.axis] = split_cord + split_begins.append(begin_shape) + + end_shape = list(split_input.checked_type.shape) + # Only the split axis coordinate changes + end_shape[post.attrs.axis] = split_cord + split_ends.append(end_shape) + + # Coordinates needs to be shifted left because beginning + # of the next section is the end of the previous + split_ends = split_ends[1:] + # Last section end is the shape of the tensor itself. + split_ends.append(list(split_input.checked_type.shape)) + + strided_slices = list() + for sb, se in zip(split_begins, split_ends): + strided_slices.append(relay.strided_slice(split_input, sb, se)) + + return relay.Tuple(strided_slices) + + +class EthosUConv2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions to ethosu_conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": "ethosu.qnn_conv2d"}))(wildcard()) + + def callback(self, pre, post, node_map): + params = ethosu_patterns.QnnConv2DParams(post.op.body) + params.ifm.tensor = post.args[0] + channels_map = { + "NHWC": 3, + } + if str(params.ofm.layout) not in channels_map.keys(): + raise UnsupportedLayout(str(params.ofm.layout)) + kernel_size_map = { + "HWIO": params.weights.shape[0:2], + "OHWI": params.weights.shape[1:3], + "HWOI": params.weights.shape[0:2], + } + if str(params.weights.layout) not in kernel_size_map.keys(): + raise UnsupportedLayout(str(params.weights.layout)) + activation_map = {"clip": "CLIP"} + weight_to_ohwi_transform_map = {"HWIO": [3, 0, 1, 2]} + weights_values = params.weights.values + weights_values_ohwi = np.transpose( + weights_values, weight_to_ohwi_transform_map[str(params.weights.layout)] + ) + if params.activation: + activation = activation_map[params.activation.op.name] + clip_min = int(params.activation.attrs.a_min) + clip_max = int(params.activation.attrs.a_max) 
+ else: + activation = "NONE" + clip_min = 0 + clip_max = 0 + scale_bias = vela_api.pack_biases( + biases=params.biases.tensor.data.asnumpy(), + ifm_scale=params.ifm.q_params.scale_f32, + ifm_dtype=np.dtype(params.ifm.dtype), + weight_scales=params.weights.q_params.scale_f32, + ofm_scale=params.ofm.q_params.scale_f32, + is_activation_tanh_or_sigmoid=activation in ["TANH", "SIGMOID"], + ) + ethosu_conv2d = ethosu_ops.ethosu_conv2d( + ifm=post.args[0], + weight=relay.const(weights_values_ohwi, params.weights.values.dtype), + scale_bias=relay.const(scale_bias, "uint8"), + lut=relay.const([], dtype="int8"), + ifm_scale=float(params.ifm.q_params.scale_f32), + ifm_zero_point=int(params.ifm.q_params.zero_point), + weight_zero_point=int(params.weights.q_params.zero_point), + ofm_scale=float(params.ofm.q_params.scale_f32), + ofm_zero_point=int(params.ofm.q_params.zero_point), + kernel_shape=kernel_size_map[str(params.weights.layout)], + ofm_channels=params.ofm.shape[channels_map[str(params.ofm.layout)]], + strides=params.strides, + padding=params.padding, + dilation=params.dilation, + activation=activation, + clip_min=clip_min, + clip_max=clip_max, + upscale="NONE", + ifm_layout=str(params.ifm.layout), + ofm_layout=str(params.ofm.layout), + ) + return ethosu_conv2d + + +class LegalizeEthosU: + """This is the wrapper class to call graph-rewrites to perform graph transformation Review comment: Ideally this should be "this is a pass which converts a Relay program into a form that can be accepted by the Ethos-U code generator." ########## File path: python/tvm/relay/backend/contrib/ethosu/vela_api.py ########## @@ -0,0 +1,314 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +conversions between TVM and Vela. Therefore, all interactions with the Review comment: Nit: The first line is not a sentence worth explaining further. ########## File path: python/tvm/relay/op/contrib/ethosu.py ########## @@ -0,0 +1,251 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Arm(R) Ethos(TM)-U NPU supported operators.""" +import numpy as np + +from tvm.relay.expr import Constant +from tvm.relay.op.contrib.register import register_pattern_table +from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant +from tvm.relay.backend.contrib.ethosu.util import QConv2DArgs +from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs +from tvm.relay.backend.contrib.ethosu.util import RequantArgs +from tvm.relay.backend.contrib.ethosu.util import get_dim_value +from ethosu.vela import api as vapi + + +def check_strides(strides): + """Checks whether strides are within the limits supported by the hardware""" + stride_range = (1, 3) + smin, smax = stride_range + if not smax >= strides[0] >= smin: + return False + if not smax >= strides[1] >= smin: + return False + return True + + +def check_valid_dtypes(tensor_params): + """Check whether dtypes are supported by the hardware""" + supported_dtypes = (np.uint8, np.int8) + for tep in tensor_params: + # Check for dtypes + if np.dtype(tep.dtype) not in supported_dtypes: + return False + # Check for shape sizes + if any(dimlen > 65536 for dimlen in tep.shape): + return False + return True + + +def check_weights(weights, dilation): + """Checks whether weight tensor is compatible with HW""" + dilated_height_range = (1, 64) + dilated_hxw_range = (1, 64 * 64) + weights_limit = 127 * 65536 + dilated_width = (weights.shape[get_dim_value(weights.layout, "W")] - 1) * dilation[0] + 1 + dilated_height = (weights.shape[get_dim_value(weights.layout, "H")] - 1) * dilation[1] + 1 + dh_min, dh_max = dilated_height_range + if not dh_min <= dilated_height <= dh_max: + return False + dilated_hxw = dilated_height * dilated_width + dhxw_min, dhxw_max = dilated_hxw_range + if not dhxw_min <= dilated_hxw <= dhxw_max: + return False + # A saturation upper bound check for accumulators + weights.values = weights.values - weights.q_params.zero_point + axis = ( + get_dim_value(weights.layout, "H"), + 
get_dim_value(weights.layout, "W"), + get_dim_value(weights.layout, "I"), + ) + sum_weights = np.amax(np.sum(np.absolute(weights.values), axis=axis)) + if not sum_weights <= weights_limit: + return False + return True + + +def check_bias(bias): + """Check whether the bias values fit in 40 bits""" + if bias and bias.dtype == np.dtype("int64"): + valid = all(len(bin(bias_value)[2:]) <= 40 for bias_value in bias.values) + return valid + return True + + +def check_batch_size(ifm): + """Checks for the number of batches vela currently supports""" + if ifm.shape[0] != 1: + return False + return True + + +def check_dilation(dilation): + """Checks whether dilation is within the limits supported by the hardware""" + dilation_range = (1, 2) + dmin, dmax = dilation_range + if not dmin <= dilation[0] <= dmax: + return False + if not dmin <= dilation[1] <= dmax: + return False + return True + + +def check_padding(padding, bounds): + """Checks whether padding is within the limits supported by the hardware""" + if len(padding) != 4 or len(bounds) != 4: + return False + top, left, bottom, right = padding + topb, leftb, bottomb, rightb = bounds + if top > topb or left > leftb or bottom > bottomb or right > rightb: + return False + return True + + +class TensorParams: + """ + This class will parse a tvm Expr along with quantization scale + and zero point to populate parameters that are required + for the creation of tensors in Vela. 
+ """ + + def __init__(self, tensor, layout=None, scale=None, zero_point=None): + self.tensor = tensor + if isinstance(tensor, Constant): + self.values = tensor.data.asnumpy() + else: + self.values = None + self.dtype = tensor.checked_type.dtype + self.shape = [int(i) for i in tensor.checked_type.shape] + self.layout = layout + + if scale is not None and zero_point is not None: + self.q_params = vapi.NpuQuantization( + scale.data.asnumpy().astype("float32"), zero_point.data.asnumpy().astype(self.dtype) + ) + else: + # put default values + self.q_params = vapi.NpuQuantization(1.0, 0) + + +class QnnConv2DParams: + """ + This class will parse a Call to a ethosu.qnn_conv2d_clip composite function + and extract quantization information of all the associated tensors. + """ + + composite_name = "ethosu.qnn_conv2d" + # The hardware only supports padding upto the numbers as follows + padding_bounds = [31, 31, 32, 32] + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body): + activation = None + if str(func_body.op) in self.activation_map.keys(): + activation = func_body + requantize_op = activation.args[0] + else: + requantize_op = func_body + bias_add = requantize_op.args[0] + qnn_conv2d = bias_add.args[0] + data_layout = qnn_conv2d.attrs.data_layout + kernel_layout = qnn_conv2d.attrs.kernel_layout + # We consider the weights & biases as params as it should be a Constant + self.weights = TensorParams( + qnn_conv2d.args[QConv2DArgs.weights.value], + kernel_layout, + qnn_conv2d.args[QConv2DArgs.weights_scale.value], + qnn_conv2d.args[QConv2DArgs.weights_zero_point.value], + ) + + self.biases = TensorParams( + bias_add.args[BiasAddArgs.biases.value], + data_layout, + requantize_op.args[RequantArgs.ifm_scale.value], + requantize_op.args[RequantArgs.ifm_zero_point.value], + ) + self.ifm = TensorParams( + qnn_conv2d.args[QConv2DArgs.ifm.value], + data_layout, + qnn_conv2d.args[QConv2DArgs.ifm_scale.value], + qnn_conv2d.args[QConv2DArgs.ifm_zero_point.value], + ) + 
self.ofm = TensorParams( + func_body, + data_layout, + requantize_op.args[RequantArgs.ofm_scale.value], + requantize_op.args[RequantArgs.ofm_zero_point.value], + ) + self.padding = qnn_conv2d.attrs.padding + self.strides = qnn_conv2d.attrs.strides + self.dilation = qnn_conv2d.attrs.dilation + self.activation = activation + + # If groups are equal to channel, its a depthwise_conv2d + self.groups = qnn_conv2d.attrs.groups + self.is_depthwise = False + channels_axis = {"HWIO": 3, "HWOI": 2} + if qnn_conv2d.attrs.groups == self.weights.shape[channels_axis[kernel_layout]]: + self.is_depthwise = True + + def is_valid(self): + """ + Checks whether QnnConv2D with Clip has compatible attributes with HW + """ + tensor_params = [self.weights, self.ifm, self.ofm] + if not check_valid_dtypes(tensor_params): + return False + if not check_weights(self.weights, self.dilation): + return False + if not check_bias(self.biases): + return False + if not check_strides(self.strides): + return False + if not check_batch_size(self.ifm): + return False + if not check_dilation(self.dilation): + return False + if not check_padding(self.padding, self.padding_bounds): + return False + legal_groups = [1, self.ofm.shape[3]] + if self.groups not in legal_groups: + return False + # This should be a valid QnnDepthwise2DParams, not QnnConv2DParams + if self.is_depthwise: + return False + return True + + +def qnn_conv2d_pattern(): + """ + Create pattern for qnn.conv2D with optional fused relu Review comment: Improve the grammar in these if possible. ########## File path: src/relay/backend/contrib/ethosu/preprocess.cc ########## @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include <tvm/ir/error.h> +#include <tvm/relay/analysis.h> +#include <tvm/relay/attrs/annotation.h> +#include <tvm/relay/expr.h> +#include <tvm/relay/expr_functor.h> +#include <tvm/relay/transform.h> + +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done : + * 1) If multiple inputs are present, they needed to be concat before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concat at the end of external function. + * 4) Then, the concat output again need to be split and made the original tuple output in the + * main. + */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast<Function>(mod->Lookup("main")); + } + + /*! 
+ * \brief This function will take shape and compute + * the scalar size value for it to be use to create + * flat single dimensional tensors. + */ + int64_t CalcSize(const Array<Integer>& shape) { + int size = 1; + for (auto dim_sz : shape) { + size = size * Downcast<Integer>(dim_sz)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. + */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array<Integer> oshape = {Integer(flatten_size)}; + return MakeReshape(input, oshape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array<Expr>& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original inputs shapes + * to recreate a Tuple of the original set of tensors. + */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array<Expr>& original_args) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + Array<Expr> rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! 
+ * \brief Modify the external function to split the input as the original compute + * as required originally. Moreover, the outputs will be flattened and concat'd + * to make a single output. Finaly, the external function should only have a single input + * and a single output. + */ + Function ModifyExternalFunction(const Function& func, GlobalVar gv, const DataType& dtype) { + Array<Expr> inputs; + Var ifms; + if (func->params.size() > 1) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + + auto func_name = gv->name_hint; + int total_size = 0; + for (auto input : func->params) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + shapes.push_back(shape); + auto flat_size = CalcSize(shape); + flatten_tensor_sizes.push_back(flat_size); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += flat_size; + } + Array<PrimExpr> ifms_shape = {total_size}; + ifms = Var(func_name + "_ifms", TensorType(ifms_shape, dtype)); + auto split_outs = MakeSplit(ifms, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = func->params[i]->checked_type(); + inputs.push_back(MakeReshape(split_out, shapes[i])); + } + } else { + CHECK_EQ(func->params.size(), 1); + inputs.push_back(func->params[0]); + ifms = func->params[0]; + } + Map<Var, Expr> bind_map; + CHECK_EQ(func->params.size(), inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + bind_map.Set(func->params[i], inputs[i]); + } + auto core_compute_expr = Bind(func->body, bind_map); + + // Creation of wrapper inside the external function + Array<Var> params = {ifms}; + if (func->body->IsInstance<TupleNode>()) { + auto tuple_out = func->body.as<TupleNode>(); + Array<Expr> reshaped_outputs; + for (unsigned int i = 0; i < tuple_out->fields.size(); i++) { + auto out = 
Downcast<Tuple>(core_compute_expr)->fields[i]; + out->checked_type_ = tuple_out->fields[i]->checked_type_; + reshaped_outputs.push_back(CreateFlattenTensor(out)); + } + auto concat_out = CreateConcatTensor(reshaped_outputs); + auto f = Function(params, concat_out, concat_out->checked_type_, {}, func->attrs); + return InferType(f, this->module_); + } else { + auto f = + Function(params, core_compute_expr, core_compute_expr->checked_type_, {}, func->attrs); + return InferType(f, this->module_); + } + } + + Expr Rewrite_(const CallNode* call, const Expr& post) final { + auto post_call = Downcast<Call>(post); + + if (auto glb_var_node = post_call->op.as<GlobalVarNode>()) { + auto glb_var = GetRef<GlobalVar>(glb_var_node); + auto func = Downcast<Function>(module_->functions[glb_var]); + + // If the number of inputs and output are 1 --> no need to do anything + if (post_call->args.size() == 1 && !func->body->IsInstance<TupleNode>()) { + return post; + } + if (auto compiler = func->GetAttr<String>(attr::kCompiler)) { + if (compiler == "ethosu") { + auto ext_input = std::move(post_call->args[0]); + auto arg_dtype = Downcast<TensorType>(post_call->args[0]->checked_type())->dtype; + if (post_call->args.size() > 1) { + Array<Expr> reshaped_inputs; + for (const auto& arg : post_call->args) { + // All arguments should be of same data type + CHECK_EQ(arg_dtype, Downcast<TensorType>(arg->checked_type())->dtype) + << "Currently NPU external functions require all inputs to be of same data " + "type"; + reshaped_inputs.push_back(CreateFlattenTensor(arg)); + } + ext_input = CreateConcatTensor(reshaped_inputs); + } + auto ext_func = ModifyExternalFunction(func, glb_var, arg_dtype); + Array<Expr> new_args = {ext_input}; + module_->Add(glb_var, ext_func); + Expr new_call = Call(glb_var, new_args); + if (func->body->IsInstance<TupleNode>()) { + auto orginal_tuple_out = Downcast<Tuple>(func->body); + new_call = CreateSplitReshapedTensors(new_call, orginal_tuple_out->fields); + } + return 
std::move(new_call); + } + } + } + return post; + } + + private: + IRModule module_; +}; + +IRModule PreprocessExternalFuncIO_(IRModule module) { Review comment: Passes shouldn't directly mutate the module contents, we assume that we copy if we want to mutate. ########## File path: python/tvm/relay/backend/contrib/__init__.py ########## @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""external backend codegen modules for relay.""" Review comment: Nit: can you capitalize this correctly? ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. + """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. 
+ + Parameters + ---------- + split : relay.Expr + The relay expression for split operator + + Returns + ------- + section_begins : list + A list containing integers corresponding to section + begins + """ + indices_or_sections = split.attrs.indices_or_sections + input_shape = split.args[0].checked_type.shape + split_axis = split.attrs.axis + + if isinstance(indices_or_sections, tvm.ir.container.Array): + # 0 is the beginning of the first section. + return [0] + list(indices_or_sections) + split_axis_len = input_shape[split_axis].value + section_length = split_axis_len // indices_or_sections.value + section_begins = list(range(0, split_axis_len, section_length)) + return section_begins + + def callback(self, pre, post, node_map): + splits_types = dict() + split_input = post.args[0] + for idx, field_type in enumerate(post.checked_type.fields): + split = relay.TupleGetItem(post, idx) + splits_types[split] = field_type + + split_begins = list() + split_ends = list() + section_begins_in_split_axis = self.get_section_begin_coords(post) + for split_cord in section_begins_in_split_axis: + # first begin is [0, 0, ... , 0] + begin_shape = [0 for i in range(len(split_input.checked_type.shape))] + begin_shape[post.attrs.axis] = split_cord + split_begins.append(begin_shape) + + end_shape = list(split_input.checked_type.shape) + # Only the split axis coordinate changes + end_shape[post.attrs.axis] = split_cord + split_ends.append(end_shape) + + # Coordinates needs to be shifted left because beginning + # of the next section is the end of the previous + split_ends = split_ends[1:] + # Last section end is the shape of the tensor itself. 
+ split_ends.append(list(split_input.checked_type.shape)) + + strided_slices = list() + for sb, se in zip(split_begins, split_ends): + strided_slices.append(relay.strided_slice(split_input, sb, se)) + + return relay.Tuple(strided_slices) + + +class EthosUConv2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions to ethosu_conv2d operators""" + + def __init__(self): + super().__init__(require_type=True) + self.pattern = (wildcard().has_attr({"Composite": "ethosu.qnn_conv2d"}))(wildcard()) + + def callback(self, pre, post, node_map): + params = ethosu_patterns.QnnConv2DParams(post.op.body) + params.ifm.tensor = post.args[0] + channels_map = { + "NHWC": 3, + } + if str(params.ofm.layout) not in channels_map.keys(): + raise UnsupportedLayout(str(params.ofm.layout)) + kernel_size_map = { + "HWIO": params.weights.shape[0:2], + "OHWI": params.weights.shape[1:3], + "HWOI": params.weights.shape[0:2], + } + if str(params.weights.layout) not in kernel_size_map.keys(): + raise UnsupportedLayout(str(params.weights.layout)) + activation_map = {"clip": "CLIP"} + weight_to_ohwi_transform_map = {"HWIO": [3, 0, 1, 2]} + weights_values = params.weights.values + weights_values_ohwi = np.transpose( + weights_values, weight_to_ohwi_transform_map[str(params.weights.layout)] + ) + if params.activation: + activation = activation_map[params.activation.op.name] + clip_min = int(params.activation.attrs.a_min) + clip_max = int(params.activation.attrs.a_max) + else: + activation = "NONE" + clip_min = 0 + clip_max = 0 + scale_bias = vela_api.pack_biases( + biases=params.biases.tensor.data.asnumpy(), + ifm_scale=params.ifm.q_params.scale_f32, + ifm_dtype=np.dtype(params.ifm.dtype), + weight_scales=params.weights.q_params.scale_f32, + ofm_scale=params.ofm.q_params.scale_f32, + is_activation_tanh_or_sigmoid=activation in ["TANH", "SIGMOID"], + ) + ethosu_conv2d = ethosu_ops.ethosu_conv2d( + ifm=post.args[0], + weight=relay.const(weights_values_ohwi, 
params.weights.values.dtype), + scale_bias=relay.const(scale_bias, "uint8"), + lut=relay.const([], dtype="int8"), + ifm_scale=float(params.ifm.q_params.scale_f32), + ifm_zero_point=int(params.ifm.q_params.zero_point), + weight_zero_point=int(params.weights.q_params.zero_point), + ofm_scale=float(params.ofm.q_params.scale_f32), + ofm_zero_point=int(params.ofm.q_params.zero_point), + kernel_shape=kernel_size_map[str(params.weights.layout)], + ofm_channels=params.ofm.shape[channels_map[str(params.ofm.layout)]], + strides=params.strides, + padding=params.padding, + dilation=params.dilation, + activation=activation, + clip_min=clip_min, + clip_max=clip_max, + upscale="NONE", + ifm_layout=str(params.ifm.layout), + ofm_layout=str(params.ofm.layout), + ) + return ethosu_conv2d + + +class LegalizeEthosU: Review comment: Is there a reason that these are just functions over functions? it would be better to convert these into passes if the intention is to convert an entire program. ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. + """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. + + Parameters + ---------- + split : relay.Expr Review comment: Is this the split operation (i.e. relay.Op or relay.Expr)? ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. + """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. + + Parameters + ---------- + split : relay.Expr + The relay expression for split operator + + Returns + ------- + section_begins : list Review comment: What is a section begin? 
########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. 
+ """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer Review comment: ```suggestion """Currently, the split operator takes an array of indices or an integer ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are Review comment: ```suggestion for the code generator. The code generator matches strided_slices which are ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported Review comment: Can you expand this? ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. + """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. + + Parameters + ---------- + split : relay.Expr + The relay expression for split operator + + Returns + ------- + section_begins : list + A list containing integers corresponding to section + begins + """ + indices_or_sections = split.attrs.indices_or_sections + input_shape = split.args[0].checked_type.shape + split_axis = split.attrs.axis + + if isinstance(indices_or_sections, tvm.ir.container.Array): + # 0 is the beginning of the first section. 
+ return [0] + list(indices_or_sections) + split_axis_len = input_shape[split_axis].value + section_length = split_axis_len // indices_or_sections.value + section_begins = list(range(0, split_axis_len, section_length)) + return section_begins + + def callback(self, pre, post, node_map): + splits_types = dict() + split_input = post.args[0] + for idx, field_type in enumerate(post.checked_type.fields): + split = relay.TupleGetItem(post, idx) + splits_types[split] = field_type + + split_begins = list() + split_ends = list() + section_begins_in_split_axis = self.get_section_begin_coords(post) + for split_cord in section_begins_in_split_axis: + # first begin is [0, 0, ... , 0] + begin_shape = [0 for i in range(len(split_input.checked_type.shape))] + begin_shape[post.attrs.axis] = split_cord + split_begins.append(begin_shape) + + end_shape = list(split_input.checked_type.shape) + # Only the split axis coordinate changes + end_shape[post.attrs.axis] = split_cord + split_ends.append(end_shape) + + # Coordinates needs to be shifted left because beginning + # of the next section is the end of the previous + split_ends = split_ends[1:] + # Last section end is the shape of the tensor itself. + split_ends.append(list(split_input.checked_type.shape)) + + strided_slices = list() + for sb, se in zip(split_begins, split_ends): + strided_slices.append(relay.strided_slice(split_input, sb, se)) + + return relay.Tuple(strided_slices) + + +class EthosUConv2DRewriter(DFPatternCallback): + """Convert conv2d related composite functions to ethosu_conv2d operators""" Review comment: ```suggestion """Convert conv2d related composite functions into ethosu_conv2d operators.""" ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/errors.py ########## @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=super-init-not-called +"""This module is to hold all type of errors associated Arm(R) Ethos(TM)-U NPU Codegen""" Review comment: ```suggestion """This module defines all error types associated with the Arm(R) Ethos(TM)-U NPU code generator.""" ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/legalize.py ########## @@ -0,0 +1,200 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +""" A set of passes to legalize some of operations for the NPU""" +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.dataflow_pattern import DFPatternCallback +from tvm.relay.dataflow_pattern import wildcard +from tvm.relay.dataflow_pattern import is_op +from tvm.relay.dataflow_pattern import rewrite +from tvm.relay.backend.contrib.ethosu import op as ethosu_ops +from tvm.relay.backend.contrib.ethosu.errors import UnsupportedLayout +from tvm.relay.backend.contrib.ethosu import vela_api +from tvm.relay.op.contrib import ethosu as ethosu_patterns + + +class SplitRewriter(DFPatternCallback): + """Convert split operations to bunch of strided_slice operations, + because codegen is going to be based on strided_slices that are + close to in/out boxes of Vela High-Level Command Stream (HLCS). + Moreover, Vela HLCS is a high-level description of the supported + hardware operator. + """ + + def __init__(self): + super().__init__(require_type=True) + self.split_in = wildcard() + self.pattern = is_op("split")(self.split_in) + + @staticmethod + def get_section_begin_coords(split): + """Currently, the split can take an array of indices or an integer + indicating the number of splits. This helper functions unifies + this by making it a array of section begins. Review comment: Clarify this line. ########## File path: python/tvm/relay/backend/contrib/ethosu/errors.py ########## @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=super-init-not-called +"""This module is to hold all type of errors associated Arm(R) Ethos(TM)-U NPU Codegen""" + + +class EthosUCodegenError(Exception): + """Base class for all exceptions related to Codegen""" + + def __init__(self, data): + self.message = "EthosUCodegenError:" + data + + def __str__(self): + return self.message + + +class UnsupportedLayout(EthosUCodegenError): + """Raised when unsupported layout is encountered in the codegen""" Review comment: ```suggestion """Raised when unsupported layout is encountered during code generation.""" ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/te/dma.py ########## @@ -0,0 +1,299 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unnecessary-lambda +"""Tensor Expressions for operations supported by the DMA engine""" +import tvm +from tvm import te +from tvm.topi.utils import equal_const_int + + +def _pad_tensor(tensor, pad_before, pad_after=None): + """Generate a padded tensor. + + Parameters + ---------- + tensor : te.Tensor + The tensor to pad. + pad_before : tuple of int + The 'before' padding on each axis. + pad_after : tuple of int + The 'after' padding on each axis. + Returns + ------- + _pad : callable + The padded tensor. + + """ + pad_after = pad_after or pad_before + dims = len(tensor.shape) + assert len(pad_before) == dims + assert len(pad_after) == dims + + def _pad(*indices): + not_zero = [] + index_tuple = [] + for i in range(dims): + if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0): + index_tuple.append(indices[i]) + else: + index_tuple.append(indices[i] - pad_before[i]) + not_zero.append(indices[i] >= pad_before[i]) + not_zero.append(indices[i] < tensor.shape[i] + pad_before[i]) + if not_zero: + not_zero = tvm.tir.all(*not_zero) + return tvm.tir.if_then_else(not_zero, tensor(*index_tuple), tvm.tir.const(0, "uint8")) + return tensor(*index_tuple) + + return _pad + + +def read_compute(tensor, layout, zero_point, scale): + """A TE compute operator to represent a read. Review comment: You can probably just say: "A tensor expression which represents a read." ########## File path: python/tvm/relay/backend/contrib/ethosu/preprocess.py ########## @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, import-outside-toplevel +"""Set of passes to pre-process the IRModule prior to codegen""" Review comment: Can you clarify the scope of what this does? ########## File path: python/tvm/relay/backend/contrib/ethosu/te/dma.py ########## @@ -0,0 +1,299 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unnecessary-lambda +"""Tensor Expressions for operations supported by the DMA engine""" Review comment: ```suggestion """Tensor Expressions for operations supported by the Ethos-U DMA engine.""" ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper utility Enums and Functions used through out codegen + +The enums are there to indicate which argument of each relay operator Review comment: Suggestion: move this commentary directly to the type or name them here so we can understand "which one" you are talking about. ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+""" +Helper utility Enums and Functions used through out codegen Review comment: ```suggestion Helper utility Enums and Functions used through out code generation. ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper utility Enums and Functions used through out codegen + +The enums are there to indicate which argument of each relay operator +corresponds with which input. +e.g., input zero point of qnn.conv2d is 4th argument(3rd index) + +The rest of the utility functions are misc. 
+Refer to the description inside such functions +""" + +from enum import Enum +import numpy as np + +from tvm import relay +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.backend.contrib.ethosu import preprocess + + +class QConv2DArgs(Enum): + """ + This is a helper enums to access the correct index + qnn conv2d arguments + """ + + ifm = 0 + weights = 1 + ifm_zero_point = 2 + weights_zero_point = 3 + ifm_scale = 4 + weights_scale = 5 + + +class RequantArgs(Enum): + """ + This is a helper enums to access the correct index + qnn requantize arguments + """ + + ifm_scale = 1 + ifm_zero_point = 2 + ofm_scale = 3 + ofm_zero_point = 4 + + +class BiasAddArgs(Enum): + """ + This is a helper enums to access the correct index + qnn bias_add arguments + """ + + biases = 1 + + +class ClipArgs(Enum): + """ + This is a helper enums to access the correct index + qnn bias_add arguments + """ + + a_min = 1 + a_max = 2 + + +class MaxPoolArgs(Enum): + """ + This is a helper enums to access the correct index + max pool arguments + """ + + ifm = 0 + + +class AddArgs(Enum): + """This is a helper enums to access the correct index + max pool arguments + """ + + ifm0 = 0 + ifm1 = 1 + ifm0_scale = 2 + ifm0_zero_point = 3 + ifm1_scale = 4 + ifm1_zero_point = 5 + ofm_scale = 6 + ofm_zero_point = 7 + + +def is_composite_func(func, name): + """ + This a method to check whether the call is to Review comment: ```suggestion This method checks whether the call is to ``` ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper utility Enums and Functions used through out codegen + +The enums are there to indicate which argument of each relay operator +corresponds with which input. +e.g., input zero point of qnn.conv2d is 4th argument(3rd index) + +The rest of the utility functions are misc. +Refer to the description inside such functions +""" + +from enum import Enum +import numpy as np + +from tvm import relay +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.backend.contrib.ethosu import preprocess + + +class QConv2DArgs(Enum): + """ + This is a helper enums to access the correct index + qnn conv2d arguments + """ + + ifm = 0 + weights = 1 + ifm_zero_point = 2 + weights_zero_point = 3 + ifm_scale = 4 + weights_scale = 5 + + +class RequantArgs(Enum): + """ + This is a helper enums to access the correct index + qnn requantize arguments + """ + + ifm_scale = 1 + ifm_zero_point = 2 + ofm_scale = 3 + ofm_zero_point = 4 + + +class BiasAddArgs(Enum): + """ + This is a helper enums to access the correct index + qnn bias_add arguments + """ + + biases = 1 + + +class ClipArgs(Enum): + """ + This is a helper enums to access the correct index + qnn bias_add arguments + """ + + a_min = 1 + a_max = 2 + + +class MaxPoolArgs(Enum): + """ + This is a helper enums to access the correct index + max pool arguments + """ + + ifm = 0 + + +class AddArgs(Enum): + """This is a helper enums to access the correct index + max pool arguments + """ + + ifm0 = 0 + ifm1 = 1 + ifm0_scale = 2 + ifm0_zero_point = 3 + ifm1_scale = 4 + ifm1_zero_point = 5 + ofm_scale = 6 + 
ofm_zero_point = 7 + + +def is_composite_func(func, name): + """ + This a method to check whether the call is to + a composite function of the "name". Review comment: You should be able to use Pydoc to reference the argument in these cases. ########## File path: python/tvm/relay/op/contrib/ethosu.py ########## @@ -0,0 +1,251 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm(R) Ethos(TM)-U NPU supported operators.""" +import numpy as np + +from tvm.relay.expr import Constant +from tvm.relay.op.contrib.register import register_pattern_table +from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant +from tvm.relay.backend.contrib.ethosu.util import QConv2DArgs +from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs +from tvm.relay.backend.contrib.ethosu.util import RequantArgs +from tvm.relay.backend.contrib.ethosu.util import get_dim_value +from ethosu.vela import api as vapi + + +def check_strides(strides): + """Checks whether strides are within the limits supported by the hardware""" Review comment: Nit: might be worth expanding HW -> Ethos-U for better understanding when reading this without knowing exactly what file we are in. 
########## File path: src/relay/backend/contrib/ethosu/preprocess.cc ########## @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include <tvm/ir/error.h> +#include <tvm/relay/analysis.h> +#include <tvm/relay/attrs/annotation.h> +#include <tvm/relay/expr.h> +#include <tvm/relay/expr_functor.h> +#include <tvm/relay/transform.h> + +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done : + * 1) If multiple inputs are present, they needed to be concat before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concat at the end of external function. + * 4) Then, the concat output again need to be split and made the original tuple output in the + * main. 
+ */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast<Function>(mod->Lookup("main")); + } + + /*! + * \brief This function will take shape and compute + * the scalar size value for it to be use to create + * flat single dimensional tensors. + */ + int64_t CalcSize(const Array<Integer>& shape) { + int size = 1; + for (auto dim_sz : shape) { + size = size * Downcast<Integer>(dim_sz)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. + */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array<Integer> oshape = {Integer(flatten_size)}; + return MakeReshape(input, oshape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array<Expr>& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original inputs shapes + * to recreate a Tuple of the original set of tensors. 
+ */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array<Expr>& original_args) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + Array<Expr> rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! + * \brief Modify the external function to split the input as the original compute + * as required originally. Moreover, the outputs will be flattened and concat'd + * to make a single output. Finaly, the external function should only have a single input + * and a single output. + */ + Function ModifyExternalFunction(const Function& func, GlobalVar gv, const DataType& dtype) { Review comment: ```suggestion Function ModifyExternalFunction(const Function& func, const GlobalVar& gv, const DataType& dtype) { ``` Can all of these be const? ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper utility Enums and Functions used through out codegen + +The enums are there to indicate which argument of each relay operator +corresponds with which input. +e.g., input zero point of qnn.conv2d is 4th argument(3rd index) + +The rest of the utility functions are misc. +Refer to the description inside such functions +""" + +from enum import Enum +import numpy as np + +from tvm import relay +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.backend.contrib.ethosu import preprocess + + +class QConv2DArgs(Enum): + """ + This is a helper enums to access the correct index Review comment: I know I have left a lot of English nits, but would be good to make all of these full sentences with punctuation. ########## File path: python/tvm/relay/backend/contrib/ethosu/util.py ########## @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Helper utility Enums and Functions used through out codegen + +The enums are there to indicate which argument of each relay operator +corresponds with which input. +e.g., input zero point of qnn.conv2d is 4th argument(3rd index) + +The rest of the utility functions are misc. +Refer to the description inside such functions +""" + +from enum import Enum +import numpy as np + +from tvm import relay +from tvm.relay.build_module import bind_params_by_name +from tvm.relay.backend.contrib.ethosu import preprocess + + +class QConv2DArgs(Enum): + """ + This is a helper enums to access the correct index + qnn conv2d arguments + """ + + ifm = 0 Review comment: Style: these are normally capitalized in Python I believe. ########## File path: src/relay/op/contrib/ethosu/convolution.cc ########## @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/convolution.cc + * \brief Property def of the Arm(R) Ethos(TM)-U NPU convolution ops. Review comment: Clarify this? ```suggestion * \brief Operator definitions for the Arm(R) Ethos(TM)-U NPU convolution ops. 
``` ########## File path: python/tvm/relay/backend/contrib/ethosu/vela_api.py ########## @@ -0,0 +1,314 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +conversions between TVM and Vela. Therefore, all interactions with the +Vela API are supposed to go through this adapter, with the hope that +any changes to Vela API, TVM only needs to change this file. +The following conversion APIs are added : + *Obtaining the best block config + *Compressing weights + *Packing biases +""" +import logging +import math +import numpy as np +from ethosu.vela import api as vapi + +from tvm.relay.backend.contrib.ethosu import util + +# pylint: disable=invalid-name +logger = logging.getLogger("Ethos-U") + +VELA_TO_NP_DTYPES = { + vapi.NpuDataType.UINT8: np.uint8, + vapi.NpuDataType.UINT16: np.uint16, + vapi.NpuDataType.INT8: np.int8, + vapi.NpuDataType.INT16: np.int16, + vapi.NpuDataType.INT32: np.int32, +} + +SCALE_BIAS_LENGTH = 10 + + +def get_optimal_block_config(npu_op, accel_type): + """ + "The NPU's unit of work is known as a block. It will fetch block(s) from Input + Feature Map (IFM) and a compute block for Output Feature Map (OFM). 
+ Therefore, we need to pick an optimal block configuration considering bandwidth + to bring IFM blocks and the number of OFM block computes need to happen + to cover the OFM as indicated by the npu op. + + Parameters + ---------- + npu_op : ethosu.vela.api.NpuOperation + The NPU operation and its params + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + Returns + ------- + ethosu.vela.api.NpuShape3d : + The optimal block config for the operator + """ + all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_type) + return _get_optimal_block_config(all_valid_block_configs) + + +def _get_optimal_block_config(all_valid_block_configs): + """An internal function to get block config with largest depth + and then highest volume/area""" + assert isinstance(all_valid_block_configs, list) + for block_cfg in all_valid_block_configs: + assert isinstance(block_cfg, vapi.NpuShape3D) + + # Getting the largest volume block for benchmarksing + all_valid_block_configs.sort( + key=lambda _cfg: _cfg.depth * _cfg.height * _cfg.width, reverse=True + ) + largest_volume_block_config = all_valid_block_configs[0] + largest_volume = ( + largest_volume_block_config.depth + * largest_volume_block_config.height + * largest_volume_block_config.width + ) + + all_valid_block_configs.sort(key=lambda _cfg: _cfg.depth, reverse=True) + max_d = all_valid_block_configs[0].depth + max_depth_block_configs = [_cfg for _cfg in all_valid_block_configs if _cfg.depth == max_d] + max_depth_block_configs.sort(key=lambda _cfg: _cfg.height * _cfg.width, reverse=True) + max_area = max_depth_block_configs[0].height * max_depth_block_configs[0].width + max_area_depth_block_configs = [ + _cfg for _cfg in max_depth_block_configs if _cfg.height * _cfg.width == max_area + ] + # This to get a deterministic anwser everytime + max_area_depth_block_configs.sort(key=lambda _cfg: _cfg.height, reverse=True) + assert len(max_area_depth_block_configs) > 0 + current_volume = ( + 
max_area_depth_block_configs[0].depth + * max_area_depth_block_configs[0].height + * max_area_depth_block_configs[0].width + ) + logger.info("Using block config=%s", max_area_depth_block_configs[0]) + logger.info( + "Quality of the block config w.r.t. max volume block config=%s", + 100.0 * (current_volume / largest_volume), + ) + return max_area_depth_block_configs[0] + + +def compress_weights( + weights, + weights_zp, + weights_layout, + ifm_bitdepth, + block_depth, + dilation, + accel_type, + is_depthwise=False, +): + """Obtain compressed weights from vela Review comment: It would be worth explaining more above — what does it mean to "obtain the compressed weights from vela"? ########## File path: python/tvm/relay/backend/contrib/ethosu/te/dma.py ########## @@ -0,0 +1,299 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unnecessary-lambda +"""Tensor Expressions for operations supported by the DMA engine""" Review comment: Can you maybe explain why all the operations need to be redefined here? I think it is due to the fact that the Ethos-U code generator matches its own TE for lowering, but it's hard to tell from the code.
########## File path: python/tvm/relay/backend/contrib/ethosu/vela_api.py ########## @@ -0,0 +1,314 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +conversions between TVM and Vela. Therefore, all interactions with the +Vela API are supposed to go through this adapter, with the hope that +any changes to Vela API, TVM only needs to change this file. +The following conversion APIs are added : + *Obtaining the best block config + *Compressing weights + *Packing biases +""" +import logging +import math +import numpy as np +from ethosu.vela import api as vapi + +from tvm.relay.backend.contrib.ethosu import util + +# pylint: disable=invalid-name +logger = logging.getLogger("Ethos-U") + +VELA_TO_NP_DTYPES = { + vapi.NpuDataType.UINT8: np.uint8, + vapi.NpuDataType.UINT16: np.uint16, + vapi.NpuDataType.INT8: np.int8, + vapi.NpuDataType.INT16: np.int16, + vapi.NpuDataType.INT32: np.int32, +} + +SCALE_BIAS_LENGTH = 10 + + +def get_optimal_block_config(npu_op, accel_type): + """ + "The NPU's unit of work is known as a block. It will fetch block(s) from Input + Feature Map (IFM) and a compute block for Output Feature Map (OFM). 
+ Therefore, we need to pick an optimal block configuration considering bandwidth + to bring IFM blocks and the number of OFM block computes need to happen + to cover the OFM as indicated by the npu op. + + Parameters + ---------- + npu_op : ethosu.vela.api.NpuOperation + The NPU operation and its params + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + Returns + ------- + ethosu.vela.api.NpuShape3d : + The optimal block config for the operator + """ + all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_type) + return _get_optimal_block_config(all_valid_block_configs) + + +def _get_optimal_block_config(all_valid_block_configs): + """An internal function to get block config with largest depth + and then highest volume/area""" + assert isinstance(all_valid_block_configs, list) + for block_cfg in all_valid_block_configs: + assert isinstance(block_cfg, vapi.NpuShape3D) + + # Getting the largest volume block for benchmarksing + all_valid_block_configs.sort( + key=lambda _cfg: _cfg.depth * _cfg.height * _cfg.width, reverse=True + ) + largest_volume_block_config = all_valid_block_configs[0] + largest_volume = ( + largest_volume_block_config.depth + * largest_volume_block_config.height + * largest_volume_block_config.width + ) + + all_valid_block_configs.sort(key=lambda _cfg: _cfg.depth, reverse=True) + max_d = all_valid_block_configs[0].depth + max_depth_block_configs = [_cfg for _cfg in all_valid_block_configs if _cfg.depth == max_d] + max_depth_block_configs.sort(key=lambda _cfg: _cfg.height * _cfg.width, reverse=True) + max_area = max_depth_block_configs[0].height * max_depth_block_configs[0].width + max_area_depth_block_configs = [ + _cfg for _cfg in max_depth_block_configs if _cfg.height * _cfg.width == max_area + ] + # This to get a deterministic anwser everytime + max_area_depth_block_configs.sort(key=lambda _cfg: _cfg.height, reverse=True) + assert len(max_area_depth_block_configs) > 0 + current_volume = ( + 
max_area_depth_block_configs[0].depth + * max_area_depth_block_configs[0].height + * max_area_depth_block_configs[0].width + ) + logger.info("Using block config=%s", max_area_depth_block_configs[0]) + logger.info( + "Quality of the block config w.r.t. max volume block config=%s", + 100.0 * (current_volume / largest_volume), + ) + return max_area_depth_block_configs[0] + + +def compress_weights( + weights, + weights_zp, + weights_layout, + ifm_bitdepth, + block_depth, + dilation, + accel_type, + is_depthwise=False, +): + """Obtain compressed weights from vela + + Parameters + ---------- + weights : numpy.ndarray + The raw weights + weights_zp : int + The zero point of the weights + weights_layout : str + A string literal indicating the layout + Supported values : HWIO, HWOI, OHWI + ifm_bitdepth : int + The bit depth of the ifm the weights are used with + block_depth : int + The depth of the optimal block config for the operator + dilation : tuple + A tuple of 2 elements indicating dilation in h and w + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + is_depthwise : bool, Optional + This indicates whether the weights are compressed for depthwise convolution + + Returns + ------- + compressed_weights : bytearray + Compressed weights + """ + layout_transform_indices = {"HWIO": (3, 0, 1, 2), "HWOI": (2, 0, 1, 3), "OHWI": (0, 1, 2, 3)} + assert weights_layout in layout_transform_indices.keys() + assert isinstance(weights_zp, np.int64) + weights = weights.astype(np.int64) - weights_zp + # Vela needs the weights in OHWI layout + weights_ohwi = np.transpose(weights, layout_transform_indices[weights_layout]) + shape_ohwi = [ + weights.shape[layout_transform_indices[weights_layout][0]], + weights.shape[layout_transform_indices[weights_layout][1]], + weights.shape[layout_transform_indices[weights_layout][2]], + weights.shape[layout_transform_indices[weights_layout][3]], + ] + block_traversal = calculate_block_traversal_mode(is_depthwise, shape_ohwi, 
ifm_bitdepth) + compressed_weights = vapi.npu_encode_weights( + accelerator=accel_type, + weights_volume=weights_ohwi, + dilation_xy=dilation, + ifm_bitdepth=ifm_bitdepth, + ofm_block_depth=block_depth, + is_depthwise=is_depthwise, + block_traversal=block_traversal, + ) + return compressed_weights + + +def calculate_block_traversal_mode(is_depthwise, weights_shape_ohwi, ifm_bitdepth): + """Calculate a block traversal mode given whether the op is depthwise convolution, + shape of weights and bit-depth of the ifm. + """ + + if is_depthwise: + return vapi.NpuBlockTraversal.DEPTH_FIRST + # Determine which block traversal strategy has better DPU utilization + kernel_size = weights_shape_ohwi[1] * weights_shape_ohwi[2] + depth_utilization = weights_shape_ohwi[3] / util.round_up( + weights_shape_ohwi[3], 32 if ifm_bitdepth == 8 else 16 + ) + part_kernel_utilization = (weights_shape_ohwi[3] / util.round_up(weights_shape_ohwi[3], 8)) * ( + kernel_size / util.round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2) + ) + if part_kernel_utilization >= depth_utilization or weights_shape_ohwi[3] <= 8: + # Part-kernel first is always better for ifm depths <= 8 + return vapi.NpuBlockTraversal.PART_KERNEL_FIRST + return vapi.NpuBlockTraversal.DEPTH_FIRST + + +def pack_biases( + biases, + ifm_scale, + ifm_dtype, + weight_scales, + ofm_scale, + is_activation_tanh_or_sigmoid=False, +): + """ + Obtain packed bias bytearray as the hardware requires from Review comment: Could use more explanation here as well. I know you guys are experts at this hardware but I have a feeling many of us will have to refactor/maintain this in the future. ########## File path: src/relay/backend/contrib/ethosu/preprocess.cc ########## @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include <tvm/ir/error.h> +#include <tvm/relay/analysis.h> +#include <tvm/relay/attrs/annotation.h> +#include <tvm/relay/expr.h> +#include <tvm/relay/expr_functor.h> +#include <tvm/relay/transform.h> + +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done : + * 1) If multiple inputs are present, they needed to be concat before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concat at the end of external function. + * 4) Then, the concat output again need to be split and made the original tuple output in the + * main. + */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast<Function>(mod->Lookup("main")); + } + + /*! 
+ * \brief This function will take shape and compute + * the scalar size value for it to be use to create + * flat single dimensional tensors. + */ + int64_t CalcSize(const Array<Integer>& shape) { + int size = 1; + for (auto dim_sz : shape) { + size = size * Downcast<Integer>(dim_sz)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. + */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array<Integer> oshape = {Integer(flatten_size)}; + return MakeReshape(input, oshape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array<Expr>& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original inputs shapes + * to recreate a Tuple of the original set of tensors. + */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array<Expr>& original_args) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + Array<Expr> rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! 
+ * \brief Modify the external function to split the input as the original compute + * as required originally. Moreover, the outputs will be flattened and concat'd + * to make a single output. Finaly, the external function should only have a single input + * and a single output. + */ + Function ModifyExternalFunction(const Function& func, GlobalVar gv, const DataType& dtype) { + Array<Expr> inputs; + Var ifms; + if (func->params.size() > 1) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + + auto func_name = gv->name_hint; + int total_size = 0; + for (auto input : func->params) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + shapes.push_back(shape); + auto flat_size = CalcSize(shape); + flatten_tensor_sizes.push_back(flat_size); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += flat_size; + } + Array<PrimExpr> ifms_shape = {total_size}; + ifms = Var(func_name + "_ifms", TensorType(ifms_shape, dtype)); + auto split_outs = MakeSplit(ifms, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = func->params[i]->checked_type(); + inputs.push_back(MakeReshape(split_out, shapes[i])); + } + } else { + CHECK_EQ(func->params.size(), 1); + inputs.push_back(func->params[0]); + ifms = func->params[0]; + } + Map<Var, Expr> bind_map; + CHECK_EQ(func->params.size(), inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + bind_map.Set(func->params[i], inputs[i]); + } + auto core_compute_expr = Bind(func->body, bind_map); + + // Creation of wrapper inside the external function + Array<Var> params = {ifms}; Review comment: @mbs-octoml can we ensure that these are duplicating the same kind of flattening code that we already have in the compiler? 
########## File path: tests/python/contrib/test_ethosu/test_legalize.py ########## @@ -0,0 +1,345 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument + +import pytest + +pytest.importorskip("ethosu.vela") +import numpy as np + +import tvm +from tvm import relay +from tvm.relay.backend.contrib import ethosu +from tvm.relay.backend.contrib.ethosu import legalize, preprocess +from tvm.relay.dataflow_pattern import * +from tvm.relay.op.contrib.ethosu import * +import relay_ir_builder + + +def test_split_indices_legalize(): + def create_graph(axis): + x = relay.var("x", shape=(1, 50, 50, 3)) + x_relu = relay.nn.relu(x) + split_o = relay.split(x_relu, [5, 20, 45], axis).tuple_value + return relay.Function([x], split_o) + + def expected_mod_axis1(): + expected_ir_string = """ + #[version = "0.0.5"] Review comment: You can use the text format to both do expected and input IR if you want, there are some examples of this elsewhere in the code base, just FYI. ########## File path: python/tvm/relay/backend/contrib/ethosu/vela_api.py ########## @@ -0,0 +1,314 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +conversions between TVM and Vela. Therefore, all interactions with the +Vela API are supposed to go through this adapter, with the hope that +any changes to Vela API, TVM only needs to change this file. +The following conversion APIs are added : + *Obtaining the best block config + *Compressing weights + *Packing biases +""" +import logging +import math +import numpy as np +from ethosu.vela import api as vapi + +from tvm.relay.backend.contrib.ethosu import util + +# pylint: disable=invalid-name +logger = logging.getLogger("Ethos-U") + +VELA_TO_NP_DTYPES = { + vapi.NpuDataType.UINT8: np.uint8, + vapi.NpuDataType.UINT16: np.uint16, + vapi.NpuDataType.INT8: np.int8, + vapi.NpuDataType.INT16: np.int16, + vapi.NpuDataType.INT32: np.int32, +} + +SCALE_BIAS_LENGTH = 10 + + +def get_optimal_block_config(npu_op, accel_type): + """ + "The NPU's unit of work is known as a block. It will fetch block(s) from Input + Feature Map (IFM) and a compute block for Output Feature Map (OFM). + Therefore, we need to pick an optimal block configuration considering bandwidth + to bring IFM blocks and the number of OFM block computes need to happen + to cover the OFM as indicated by the npu op. 
+ + Parameters + ---------- + npu_op : ethosu.vela.api.NpuOperation + The NPU operation and its params + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + Returns + ------- + ethosu.vela.api.NpuShape3d : + The optimal block config for the operator + """ + all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_type) + return _get_optimal_block_config(all_valid_block_configs) + + +def _get_optimal_block_config(all_valid_block_configs): + """An internal function to get block config with largest depth + and then highest volume/area""" + assert isinstance(all_valid_block_configs, list) + for block_cfg in all_valid_block_configs: + assert isinstance(block_cfg, vapi.NpuShape3D) + + # Getting the largest volume block for benchmarking + all_valid_block_configs.sort( + key=lambda _cfg: _cfg.depth * _cfg.height * _cfg.width, reverse=True + ) + largest_volume_block_config = all_valid_block_configs[0] + largest_volume = ( + largest_volume_block_config.depth + * largest_volume_block_config.height + * largest_volume_block_config.width + ) + + all_valid_block_configs.sort(key=lambda _cfg: _cfg.depth, reverse=True) + max_d = all_valid_block_configs[0].depth + max_depth_block_configs = [_cfg for _cfg in all_valid_block_configs if _cfg.depth == max_d] + max_depth_block_configs.sort(key=lambda _cfg: _cfg.height * _cfg.width, reverse=True) + max_area = max_depth_block_configs[0].height * max_depth_block_configs[0].width + max_area_depth_block_configs = [ + _cfg for _cfg in max_depth_block_configs if _cfg.height * _cfg.width == max_area + ] + # This is to get a deterministic answer every time + max_area_depth_block_configs.sort(key=lambda _cfg: _cfg.height, reverse=True) + assert len(max_area_depth_block_configs) > 0 + current_volume = ( + max_area_depth_block_configs[0].depth + * max_area_depth_block_configs[0].height + * max_area_depth_block_configs[0].width + ) + logger.info("Using block config=%s", max_area_depth_block_configs[0]) + logger.info( + 
"Quality of the block config w.r.t. max volume block config=%s", + 100.0 * (current_volume / largest_volume), + ) + return max_area_depth_block_configs[0] + + +def compress_weights( + weights, + weights_zp, + weights_layout, + ifm_bitdepth, + block_depth, + dilation, + accel_type, + is_depthwise=False, +): + """Obtain compressed weights from vela + + Parameters + ---------- + weights : numpy.ndarray + The raw weights + weights_zp : int + The zero point of the weights + weights_layout : str + A string literal indicating the layout + Supported values : HWIO, HWOI, OHWI + ifm_bitdepth : int + The bit depth of the ifm the weights are used with + block_depth : int + The depth of the optimal block config for the operator + dilation : tuple + A tuple of 2 elements indicating dilation in h and w + accel_type : ethosu.vela.api.NpuAccelerator + The NPU accelerator variant + is_depthwise : bool, Optional + This indicates whether the weights are compressed for depthwise convolution + + Returns + ------- + compressed_weights : bytearray + Compressed weights + """ + layout_transform_indices = {"HWIO": (3, 0, 1, 2), "HWOI": (2, 0, 1, 3), "OHWI": (0, 1, 2, 3)} + assert weights_layout in layout_transform_indices.keys() + assert isinstance(weights_zp, np.int64) + weights = weights.astype(np.int64) - weights_zp + # Vela needs the weights in OHWI layout + weights_ohwi = np.transpose(weights, layout_transform_indices[weights_layout]) + shape_ohwi = [ + weights.shape[layout_transform_indices[weights_layout][0]], + weights.shape[layout_transform_indices[weights_layout][1]], + weights.shape[layout_transform_indices[weights_layout][2]], + weights.shape[layout_transform_indices[weights_layout][3]], + ] + block_traversal = calculate_block_traversal_mode(is_depthwise, shape_ohwi, ifm_bitdepth) + compressed_weights = vapi.npu_encode_weights( + accelerator=accel_type, + weights_volume=weights_ohwi, + dilation_xy=dilation, + ifm_bitdepth=ifm_bitdepth, + ofm_block_depth=block_depth, + 
is_depthwise=is_depthwise, + block_traversal=block_traversal, + ) + return compressed_weights + + +def calculate_block_traversal_mode(is_depthwise, weights_shape_ohwi, ifm_bitdepth): + """Calculate a block traversal mode given whether the op is depthwise convolution, + shape of weights and bit-depth of the ifm. + """ + + if is_depthwise: + return vapi.NpuBlockTraversal.DEPTH_FIRST + # Determine which block traversal strategy has better DPU utilization + kernel_size = weights_shape_ohwi[1] * weights_shape_ohwi[2] + depth_utilization = weights_shape_ohwi[3] / util.round_up( + weights_shape_ohwi[3], 32 if ifm_bitdepth == 8 else 16 + ) + part_kernel_utilization = (weights_shape_ohwi[3] / util.round_up(weights_shape_ohwi[3], 8)) * ( + kernel_size / util.round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2) + ) + if part_kernel_utilization >= depth_utilization or weights_shape_ohwi[3] <= 8: + # Part-kernel first is always better for ifm depths <= 8 + return vapi.NpuBlockTraversal.PART_KERNEL_FIRST + return vapi.NpuBlockTraversal.DEPTH_FIRST + + +def pack_biases( + biases, + ifm_scale, + ifm_dtype, + weight_scales, + ofm_scale, + is_activation_tanh_or_sigmoid=False, +): + """ + Obtain packed bias bytearray as the hardware requires from + Vela. + Parameters + ---------- + biases : numpy.ndarray + The values of biases + ifm_scale : float + The quantization scale parameter of input feature map + ifm_dtype : numpy.dtype + The data type of input feature map data. + weight_scales : numpy.ndarray + The quantization scale parameter of weight feature map + This could be a tuple if per-channel quantization is present. + ofm_scale : float + The quantization scale parameter of output feature map. + is_activation_tanh_or_sigmoid : bool + Indicates whether the fused activation function is tanh or sigmoid. + + Returns + ------- + scale_bias : numpy.ndarray + Packed scales/biases as the hardware requires them. + """ + # The BYOC infra should not partition anything else. 
+ supported_ifm_dtypes = (np.uint8, np.int8, np.int16) + assert ifm_dtype in supported_ifm_dtypes + + if weight_scales.size == 1: + weight_scales = [weight_scales] * biases.size + + hw_bias_scales = _calculate_hw_bias_scales( + ifm_scale, weight_scales, ofm_scale, ifm_dtype, is_activation_tanh_or_sigmoid + ) + assert len(hw_bias_scales) == biases.size + biases = biases.astype("int64") + packed_biases = bytearray() + for idx, scale in enumerate(hw_bias_scales): + packed_biases.extend(vapi.npu_encode_bias(biases[idx], *scale)) + # Align to 16 + # remainder = (len(packed_biases)) % 16 + # if remainder > 0: + # packed_biases.extend(bytearray(16 - remainder)) + scale_bias = np.frombuffer(packed_biases, dtype=np.uint8) + scale_bias = np.reshape(scale_bias, (-1, 10)) + return scale_bias + + +def _quantize_scale(scale): + """Quantize floating point scale into 32-bit int scale with a 6-bit shift. + This is to be used with 8-bit data. + """ + mantissa, exponent = math.frexp(scale) + mantissa_scaled = mantissa * (1 << 31) + mantissa_scaled = int(util.round_away_zero(mantissa_scaled)) + required_shift = 31 - exponent + assert 0 <= required_shift < (1 << 6) + return mantissa_scaled, required_shift + + +def _reduced_quantize_scale(scale): + """A reduction of precision is required for 16 bit data.""" + mantissa_scaled, required_shift = _quantize_scale(scale) + # This is the max a signed 16-bit number could represent + max_reduced_mantissa_scaled = (1 << 15) - 1 + # if the current value is larger than pre-scaled max_reduced_mantissa_scaled + # we need to saturate the answer to max_reduced_mantissa_scaled + if mantissa_scaled >= max_reduced_mantissa_scaled << 16: + reduced_mantissa_scaled = max_reduced_mantissa_scaled + else: + reduced_mantissa_scaled = (mantissa_scaled + (1 << 15)) >> 16 + reduced_shift = required_shift - 16 + return reduced_mantissa_scaled, reduced_shift + + +def _calculate_hw_bias_scales( + ifm_scale, weight_scales, ofm_scale, ifm_dtype, is_faf_tanh_sigmoid=False 
+): + """This function will produce a scale that is calculated using scales of ifm, + weights and ofm. It is also important to note that if per-channel / per-value + quantization required they should go into hw bias scales""" + if is_faf_tanh_sigmoid: + ifm_scale = ifm_scale * 0x3000 + if ifm_dtype == np.uint8: + bias_scales = [np.double(ifm_scale * ws) / np.double(ofm_scale) for ws in weight_scales] + else: + assert ifm_dtype in (np.int8, np.int16) + ifm_scale_dbl = np.double(ifm_scale) + ofm_scale_dbl = np.double(ofm_scale) + bias_scales = [ifm_scale_dbl * np.double(ws) / ofm_scale_dbl for ws in weight_scales] + + if ifm_dtype == np.int16: + hw_bias_scales = [_reduced_quantize_scale(bs) for bs in bias_scales] + else: + assert ifm_dtype in (np.uint8, np.int8) + hw_bias_scales = [_quantize_scale(bs) for bs in bias_scales] + + return hw_bias_scales + + +def get_target_accel_type(): + """This is a helper function to convert cli accelerator type str argument Review comment: I didn't quite understand this docstring either. ########## File path: tests/python/contrib/test_ethosu/relay_ir_builder.py ########## @@ -0,0 +1,295 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Helper module to build relay operations for testing""" Review comment: Is there a reason to not use normal IRBuilder for this? ########## File path: src/relay/op/contrib/ethosu/convolution.cc ########## @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/contrib/ethosu/convolution.cc + * \brief Property def of the Arm(R) Ethos(TM)-U NPU convolution ops. + */ +#include "../../nn/convolution.h" + +#include <tvm/relay/base.h> +#include <tvm/relay/op.h> +#include <tvm/relay/qnn/attrs.h> +#include <tvm/tir/analysis.h> +#include <tvm/tir/data_layout.h> + +#include "../../../qnn/utils.h" +#include "common.h" + +namespace tvm { +namespace relay { +namespace op { +namespace contrib { +namespace ethosu { + +/*! 
\brief Attributes used by the Ethos(TM)-U NPU convolution operator */ +struct EthosuConv2DAttrs : public tvm::AttrsNode<EthosuConv2DAttrs> { + double ifm_scale; + int ifm_zero_point; + int weight_zero_point; + double ofm_scale; + int ofm_zero_point; + Array<IndexExpr> kernel_shape; + IndexExpr ofm_channels; + Array<IndexExpr> strides; + Array<IndexExpr> padding; + Array<IndexExpr> dilation; + String activation; + int clip_min; + int clip_max; + String upscale; + tvm::String ifm_layout; + tvm::String ofm_layout; + + TVM_DECLARE_ATTRS(EthosuConv2DAttrs, "relay.attrs.EthosuConv2DAttrs") { + TVM_ATTR_FIELD(ifm_scale).describe("The quantization scale for the Input Feature Map tensor."); + TVM_ATTR_FIELD(ifm_zero_point) + .describe("The quantization zero point for the Input Feature Map tensor."); + TVM_ATTR_FIELD(weight_zero_point) + .describe("The quantization zero point for the weight tensor."); + TVM_ATTR_FIELD(ofm_scale).describe("The quantization scale for the Output Feature Map tensor."); + TVM_ATTR_FIELD(ofm_zero_point) + .describe("The quantization zero point for the Output Feature Map tensor."); + TVM_ATTR_FIELD(kernel_shape) + .describe("The 2 dimensional kernel shape as (kernel_height, kernel_width).") + .set_default(NullValue<Array<IndexExpr> >()); + TVM_ATTR_FIELD(ofm_channels) + .describe("The number of OFM channels.") + .set_default(NullValue<IndexExpr>()); + TVM_ATTR_FIELD(strides) + .set_default(Array<IndexExpr>({1, 1})) + .describe("The 2 dimensional strides as (stride_height, stride_width)."); + TVM_ATTR_FIELD(padding) + .set_default(Array<IndexExpr>({0, 0, 0, 0})) + .describe("The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right)."); + TVM_ATTR_FIELD(dilation) + .set_default(Array<IndexExpr>({1, 1})) + .describe("The 2 dimensional dilation as (dilation_height, dilation_width)."); + TVM_ATTR_FIELD(activation) + .describe( + "The activation function to use. " + "'NONE' - no activation function. 
" + "'CLIP' - clip the output between clip_min and clip_max. " + "'TANH' - tanh activation function. " + "'SIGMOID' - sigmoid activation function. " + "'LUT' - use a look-up table to perform the activation function.") + .set_default("NONE"); + TVM_ATTR_FIELD(clip_min) + .describe("The minimum clipping value if activation = 'CLIP'.") + .set_default(0); + TVM_ATTR_FIELD(clip_max) + .describe("The maximum clipping value if activation = 'CLIP'.") + .set_default(0); + TVM_ATTR_FIELD(upscale) + .describe( + "The 2x2 upscaling mode to apply to the Input Feature Map tensor. " + "'NONE' - no upscaling. " + "'NEAREST' - upscale using nearest neighbour. " + "'ZEROS' - upscale using zeros.") + .set_default("NONE"); + TVM_ATTR_FIELD(ifm_layout) + .set_default("NHWC") + .describe("The layout of the Input Feature Map tensor. Can be 'NHWC' or 'NHCWB16'."); + TVM_ATTR_FIELD(ofm_layout) + .set_default("NHWC") + .describe("The layout of the Output Feature Map tensor. Can be 'NHWC' or 'NHCWB16'."); + } +}; + +TVM_REGISTER_NODE_TYPE(EthosuConv2DAttrs); + +bool EthosuConv2DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 5); + const auto* ifm = types[0].as<TensorTypeNode>(); + const auto* weight = types[1].as<TensorTypeNode>(); + const auto* scale_bias = types[2].as<TensorTypeNode>(); + if (ifm == nullptr || weight == nullptr) return false; + const auto* param = attrs.as<EthosuConv2DAttrs>(); + CHECK(param != nullptr) << "EthosuConv2DAttrs cannot be nullptr."; + CHECK(ifm->dtype == DataType::UInt(8) || ifm->dtype == DataType::Int(8)) + << "Expected ethosu_conv2d type(uint8) or type(int8) for ifm but was " << ifm->dtype; + CHECK(weight->dtype == DataType::UInt(8) || weight->dtype == DataType::Int(8)) + << "Expected ethosu_conv2d type(uint8) or type(int8) for weight but was " << weight->dtype; + CHECK(scale_bias->dtype == DataType::UInt(8)) + << "Expected ethosu_conv2d type(uint8) for scale_bias but was " << 
scale_bias->dtype; + + // The scale_bias should be provided as a tensor of size {ofm_channels, 10} + reporter->Assign(types[2], TensorType({weight->shape[0], 10}, DataType::UInt(8))); + + // Assign weight type {ofm_channels, kernel_height, kernel_width, ifm_channels} + reporter->Assign(types[1], TensorType({param->ofm_channels, param->kernel_shape[0], + param->kernel_shape[1], weight->shape[3]}, + weight->dtype)); + + // Assign ofm type + auto ofm_shape = + EthosuInferKernelOutput(ifm->shape, param->ifm_layout, param->ofm_layout, param->kernel_shape, + param->ofm_channels, param->dilation, param->strides, param->padding); + reporter->Assign(types[4], TensorType(ofm_shape, ifm->dtype)); + return true; +} + +Expr MakeEthosuConv2D(Expr ifm, Expr weight, Expr scale_bias, Expr lut, double ifm_scale, + int ifm_zero_point, int weight_zero_point, double ofm_scale, + int ofm_zero_point, Array<IndexExpr> kernel_shape, IndexExpr ofm_channels, + Array<IndexExpr> strides, Array<IndexExpr> padding, Array<IndexExpr> dilation, + String activation, int clip_min, int clip_max, String upscale, + String ifm_layout, String ofm_layout) { + auto attrs = make_object<EthosuConv2DAttrs>(); + attrs->ifm_scale = ifm_scale; + attrs->ifm_zero_point = ifm_zero_point; + attrs->weight_zero_point = weight_zero_point; + attrs->ofm_scale = ofm_scale; + attrs->ofm_zero_point = ofm_zero_point; + attrs->kernel_shape = std::move(kernel_shape); + attrs->ofm_channels = std::move(ofm_channels); + attrs->strides = std::move(strides); + attrs->padding = std::move(padding); + attrs->dilation = std::move(dilation); + attrs->activation = std::move(activation); + attrs->clip_min = clip_min; + attrs->clip_max = clip_max; + attrs->upscale = std::move(upscale); + attrs->ifm_layout = std::move(ifm_layout); + attrs->ofm_layout = std::move(ofm_layout); + static const Op& op = Op::Get("contrib.ethosu.conv2d"); + return Call(op, {ifm, weight, scale_bias, lut}, Attrs(attrs), {}); +} + 
+TVM_REGISTER_GLOBAL("relay.op._make.ethosu_conv2d").set_body_typed(MakeEthosuConv2D); + +RELAY_REGISTER_OP("contrib.ethosu.conv2d") + .describe(R"code(Arm(R) Ethos(TM)-U NPU 2D quantized convolution operator. + +This Relay operator corresponds to the hardware-implemented quantized Review comment: This is a great example of a useful comment. ########## File path: tests/python/contrib/test_ethosu/relay_ir_builder.py ########## @@ -0,0 +1,295 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Helper module to build relay operations for testing""" Review comment: cc @mbs-octoml can you take another pass on testing code, want to try and consolidate -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
