manupa-arm commented on a change in pull request #8795: URL: https://github.com/apache/tvm/pull/8795#discussion_r695439000
########## File path: python/tvm/relay/op/contrib/ethosu.py ########## @@ -0,0 +1,251 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Arm(R) Ethos(TM)-U NPU supported operators.""" +import numpy as np + +from tvm.relay.expr import Constant +from tvm.relay.op.contrib.register import register_pattern_table +from tvm.relay.dataflow_pattern import wildcard, is_op, is_constant +from tvm.relay.backend.contrib.ethosu.util import QConv2DArgs +from tvm.relay.backend.contrib.ethosu.util import BiasAddArgs +from tvm.relay.backend.contrib.ethosu.util import RequantArgs +from tvm.relay.backend.contrib.ethosu.util import get_dim_value +from ethosu.vela import api as vapi + + +def check_strides(strides): + """Checks whether strides are within the limits supported by the hardware""" + stride_range = (1, 3) + smin, smax = stride_range + if not smax >= strides[0] >= smin: + return False + if not smax >= strides[1] >= smin: + return False + return True + + +def check_valid_dtypes(tensor_params): + """Check whether dtypes are supported by the hardware""" + supported_dtypes = (np.uint8, np.int8) + for tep in tensor_params: + # Check for dtypes + if np.dtype(tep.dtype) not in supported_dtypes: + return False + # Check for shape sizes + if 
any(dimlen > 65536 for dimlen in tep.shape): + return False + return True + + +def check_weights(weights, dilation): + """Checks whether weight tensor is compatible with HW""" + dilated_height_range = (1, 64) + dilated_hxw_range = (1, 64 * 64) + weights_limit = 127 * 65536 + dilated_width = (weights.shape[get_dim_value(weights.layout, "W")] - 1) * dilation[0] + 1 + dilated_height = (weights.shape[get_dim_value(weights.layout, "H")] - 1) * dilation[1] + 1 + dh_min, dh_max = dilated_height_range + if not dh_min <= dilated_height <= dh_max: + return False + dilated_hxw = dilated_height * dilated_width + dhxw_min, dhxw_max = dilated_hxw_range + if not dhxw_min <= dilated_hxw <= dhxw_max: + return False + # A saturation upper bound check for accumulators + weights.values = weights.values - weights.q_params.zero_point + axis = ( + get_dim_value(weights.layout, "H"), + get_dim_value(weights.layout, "W"), + get_dim_value(weights.layout, "I"), + ) + sum_weights = np.amax(np.sum(np.absolute(weights.values), axis=axis)) + if not sum_weights <= weights_limit: + return False + return True + + +def check_bias(bias): + """Check whether the bias values fit in 40 bits""" + if bias and bias.dtype == np.dtype("int64"): + valid = all(len(bin(bias_value)[2:]) <= 40 for bias_value in bias.values) + return valid + return True + + +def check_batch_size(ifm): + """Checks for the number of batches vela currently supports""" + if ifm.shape[0] != 1: + return False + return True + + +def check_dilation(dilation): + """Checks whether dilation is within the limits supported by the hardware""" + dilation_range = (1, 2) + dmin, dmax = dilation_range + if not dmin <= dilation[0] <= dmax: + return False + if not dmin <= dilation[1] <= dmax: + return False + return True + + +def check_padding(padding, bounds): + """Checks whether padding is within the limits supported by the hardware""" + if len(padding) != 4 or len(bounds) != 4: + return False + top, left, bottom, right = padding + topb, leftb, 
bottomb, rightb = bounds + if top > topb or left > leftb or bottom > bottomb or right > rightb: + return False + return True + + +class TensorParams: + """ + This class will parse a tvm Expr along with quantization scale + and zero point to populate parameters that are required + for the creation of tensors in Vela. + """ + + def __init__(self, tensor, layout=None, scale=None, zero_point=None): + self.tensor = tensor + if isinstance(tensor, Constant): + self.values = tensor.data.asnumpy() + else: + self.values = None + self.dtype = tensor.checked_type.dtype + self.shape = [int(i) for i in tensor.checked_type.shape] + self.layout = layout + + if scale is not None and zero_point is not None: + self.q_params = vapi.NpuQuantization( + scale.data.asnumpy().astype("float32"), zero_point.data.asnumpy().astype(self.dtype) + ) + else: + # put default values + self.q_params = vapi.NpuQuantization(1.0, 0) + + +class QnnConv2DParams: + """ + This class will parse a Call to a ethosu.qnn_conv2d_clip composite function + and extract quantization information of all the associated tensors. 
+ """ + + composite_name = "ethosu.qnn_conv2d" + # The hardware only supports padding upto the numbers as follows + padding_bounds = [31, 31, 32, 32] + activation_map = {"clip": "CLIP"} + + def __init__(self, func_body): + activation = None + if str(func_body.op) in self.activation_map.keys(): + activation = func_body + requantize_op = activation.args[0] + else: + requantize_op = func_body + bias_add = requantize_op.args[0] + qnn_conv2d = bias_add.args[0] + data_layout = qnn_conv2d.attrs.data_layout + kernel_layout = qnn_conv2d.attrs.kernel_layout + # We consider the weights & biases as params as it should be a Constant + self.weights = TensorParams( + qnn_conv2d.args[QConv2DArgs.weights.value], + kernel_layout, + qnn_conv2d.args[QConv2DArgs.weights_scale.value], + qnn_conv2d.args[QConv2DArgs.weights_zero_point.value], + ) + + self.biases = TensorParams( + bias_add.args[BiasAddArgs.biases.value], + data_layout, + requantize_op.args[RequantArgs.ifm_scale.value], + requantize_op.args[RequantArgs.ifm_zero_point.value], + ) + self.ifm = TensorParams( + qnn_conv2d.args[QConv2DArgs.ifm.value], + data_layout, + qnn_conv2d.args[QConv2DArgs.ifm_scale.value], + qnn_conv2d.args[QConv2DArgs.ifm_zero_point.value], + ) + self.ofm = TensorParams( + func_body, + data_layout, + requantize_op.args[RequantArgs.ofm_scale.value], + requantize_op.args[RequantArgs.ofm_zero_point.value], + ) + self.padding = qnn_conv2d.attrs.padding + self.strides = qnn_conv2d.attrs.strides + self.dilation = qnn_conv2d.attrs.dilation + self.activation = activation + + # If groups are equal to channel, its a depthwise_conv2d + self.groups = qnn_conv2d.attrs.groups + self.is_depthwise = False + channels_axis = {"HWIO": 3, "HWOI": 2} + if qnn_conv2d.attrs.groups == self.weights.shape[channels_axis[kernel_layout]]: + self.is_depthwise = True + + def is_valid(self): + """ + Checks whether QnnConv2D with Clip has compatible attributes with HW + """ + tensor_params = [self.weights, self.ifm, self.ofm] + if not 
check_valid_dtypes(tensor_params): + return False + if not check_weights(self.weights, self.dilation): + return False + if not check_bias(self.biases): + return False + if not check_strides(self.strides): + return False + if not check_batch_size(self.ifm): + return False + if not check_dilation(self.dilation): + return False + if not check_padding(self.padding, self.padding_bounds): + return False + legal_groups = [1, self.ofm.shape[3]] + if self.groups not in legal_groups: + return False + # This should be a valid QnnDepthwise2DParams, not QnnConv2DParams + if self.is_depthwise: + return False + return True + + +def qnn_conv2d_pattern(): + """ + Create pattern for qnn.conv2D with optional fused relu Review comment: Done ########## File path: src/relay/backend/contrib/ethosu/preprocess.cc ########## @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include <tvm/ir/error.h> +#include <tvm/relay/analysis.h> +#include <tvm/relay/attrs/annotation.h> +#include <tvm/relay/expr.h> +#include <tvm/relay/expr_functor.h> +#include <tvm/relay/transform.h> + +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done : + * 1) If multiple inputs are present, they needed to be concat before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concat at the end of external function. + * 4) Then, the concat output again need to be split and made the original tuple output in the + * main. + */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast<Function>(mod->Lookup("main")); + } + + /*! + * \brief This function will take shape and compute + * the scalar size value for it to be use to create + * flat single dimensional tensors. + */ + int64_t CalcSize(const Array<Integer>& shape) { + int size = 1; + for (auto dim_sz : shape) { + size = size * Downcast<Integer>(dim_sz)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. 
+ */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array<Integer> oshape = {Integer(flatten_size)}; + return MakeReshape(input, oshape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array<Expr>& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original inputs shapes + * to recreate a Tuple of the original set of tensors. + */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array<Expr>& original_args) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + Array<Expr> rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! + * \brief Modify the external function to split the input as the original compute + * as required originally. Moreover, the outputs will be flattened and concat'd + * to make a single output. Finaly, the external function should only have a single input + * and a single output. 
+ */ + Function ModifyExternalFunction(const Function& func, GlobalVar gv, const DataType& dtype) { Review comment: Done ########## File path: src/relay/backend/contrib/ethosu/preprocess.cc ########## @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include <tvm/ir/error.h> +#include <tvm/relay/analysis.h> +#include <tvm/relay/attrs/annotation.h> +#include <tvm/relay/expr.h> +#include <tvm/relay/expr_functor.h> +#include <tvm/relay/transform.h> + +#include <unordered_map> +#include <unordered_set> +#include <utility> +#include <vector> + +#include "../../../op/make_op.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosu { + +/*! + * \brief This expression rewriter will traverse the graph to find calls + * to all external functions. If they have multiple inputs and/or + * multiple outputs, the following has to be done : + * 1) If multiple inputs are present, they needed to be concat before the call. + * 2) Inside the external function they need to be split again to their original inputs. + * 3) If there are multiple outputs, they need to be concat at the end of external function. 
+ * 4) Then, the concat output again need to be split and made the original tuple output in the + * main. + */ +class ExternalFuncIOHandler : public ExprRewriter { + public: + explicit ExternalFuncIOHandler(IRModule& module) : module_(module) {} + int count = 0; + + Function InferType(const Function& expr, const IRModule& m) { + IRModule mod(m); + mod->Update(mod->GetGlobalVar("main"), expr); + mod = transform::InferType()(mod); + return Downcast<Function>(mod->Lookup("main")); + } + + /*! + * \brief This function will take shape and compute + * the scalar size value for it to be use to create + * flat single dimensional tensors. + */ + int64_t CalcSize(const Array<Integer>& shape) { + int size = 1; + for (auto dim_sz : shape) { + size = size * Downcast<Integer>(dim_sz)->value; + } + return size; + } + + /*! + * \brief This will take a tensor and create a flattened + * tensor to be used by the concat. + */ + Expr CreateFlattenTensor(const Expr& input) { + auto ishape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + int flatten_size = CalcSize(ishape); + Array<Integer> oshape = {Integer(flatten_size)}; + return MakeReshape(input, oshape); + } + + /*! + * \brief This will take flattened tensors and create + * a single concat'd tensor. + */ + Expr CreateConcatTensor(const Array<Expr>& inputs) { + auto tuple = Tuple(inputs); + return MakeConcatenate(tuple, 0); + } + + /*! + * \brief This will take a flattened concat'd tensor and use the original inputs shapes + * to recreate a Tuple of the original set of tensors. 
+ */ + Expr CreateSplitReshapedTensors(const Expr& input, const Array<Expr>& original_args) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + Array<Expr> rets; + + int total_size = 0; + for (auto orig_arg : original_args) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(orig_arg->checked_type())->shape); + shapes.push_back(shape); + flatten_tensor_sizes.push_back(CalcSize(shape)); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += CalcSize(shape); + } + auto split_outs = MakeSplit(input, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = original_args[i]->checked_type_; + rets.push_back(MakeReshape(split_out, shapes[i])); + } + return Tuple(rets); + } + + /*! + * \brief Modify the external function to split the input as the original compute + * as required originally. Moreover, the outputs will be flattened and concat'd + * to make a single output. Finaly, the external function should only have a single input + * and a single output. 
+ */ + Function ModifyExternalFunction(const Function& func, GlobalVar gv, const DataType& dtype) { + Array<Expr> inputs; + Var ifms; + if (func->params.size() > 1) { + Array<Array<Integer>> shapes; + Array<Integer> flatten_tensor_sizes; + Array<IndexExpr> split_indices; + + auto func_name = gv->name_hint; + int total_size = 0; + for (auto input : func->params) { + auto shape = Downcast<Array<Integer>>(Downcast<TensorType>(input->checked_type())->shape); + shapes.push_back(shape); + auto flat_size = CalcSize(shape); + flatten_tensor_sizes.push_back(flat_size); + if (total_size != 0) { + split_indices.push_back(total_size); + } + total_size += flat_size; + } + Array<PrimExpr> ifms_shape = {total_size}; + ifms = Var(func_name + "_ifms", TensorType(ifms_shape, dtype)); + auto split_outs = MakeSplit(ifms, split_indices, 0); + for (unsigned int i = 0; i < shapes.size(); i++) { + auto split_out = TupleGetItem(split_outs, i); + split_out->checked_type_ = func->params[i]->checked_type(); + inputs.push_back(MakeReshape(split_out, shapes[i])); + } + } else { + CHECK_EQ(func->params.size(), 1); + inputs.push_back(func->params[0]); + ifms = func->params[0]; + } + Map<Var, Expr> bind_map; + CHECK_EQ(func->params.size(), inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + bind_map.Set(func->params[i], inputs[i]); + } + auto core_compute_expr = Bind(func->body, bind_map); + + // Creation of wrapper inside the external function + Array<Var> params = {ifms}; + if (func->body->IsInstance<TupleNode>()) { + auto tuple_out = func->body.as<TupleNode>(); + Array<Expr> reshaped_outputs; + for (unsigned int i = 0; i < tuple_out->fields.size(); i++) { + auto out = Downcast<Tuple>(core_compute_expr)->fields[i]; + out->checked_type_ = tuple_out->fields[i]->checked_type_; + reshaped_outputs.push_back(CreateFlattenTensor(out)); + } + auto concat_out = CreateConcatTensor(reshaped_outputs); + auto f = Function(params, concat_out, concat_out->checked_type_, {}, func->attrs); + return 
InferType(f, this->module_); + } else { + auto f = + Function(params, core_compute_expr, core_compute_expr->checked_type_, {}, func->attrs); + return InferType(f, this->module_); + } + } + + Expr Rewrite_(const CallNode* call, const Expr& post) final { + auto post_call = Downcast<Call>(post); + + if (auto glb_var_node = post_call->op.as<GlobalVarNode>()) { + auto glb_var = GetRef<GlobalVar>(glb_var_node); + auto func = Downcast<Function>(module_->functions[glb_var]); + + // If the number of inputs and output are 1 --> no need to do anything + if (post_call->args.size() == 1 && !func->body->IsInstance<TupleNode>()) { + return post; + } + if (auto compiler = func->GetAttr<String>(attr::kCompiler)) { + if (compiler == "ethosu") { + auto ext_input = std::move(post_call->args[0]); + auto arg_dtype = Downcast<TensorType>(post_call->args[0]->checked_type())->dtype; + if (post_call->args.size() > 1) { + Array<Expr> reshaped_inputs; + for (const auto& arg : post_call->args) { + // All arguments should be of same data type + CHECK_EQ(arg_dtype, Downcast<TensorType>(arg->checked_type())->dtype) + << "Currently NPU external functions require all inputs to be of same data " + "type"; + reshaped_inputs.push_back(CreateFlattenTensor(arg)); + } + ext_input = CreateConcatTensor(reshaped_inputs); + } + auto ext_func = ModifyExternalFunction(func, glb_var, arg_dtype); + Array<Expr> new_args = {ext_input}; + module_->Add(glb_var, ext_func); + Expr new_call = Call(glb_var, new_args); + if (func->body->IsInstance<TupleNode>()) { + auto orginal_tuple_out = Downcast<Tuple>(func->body); + new_call = CreateSplitReshapedTensors(new_call, orginal_tuple_out->fields); + } + return std::move(new_call); + } + } + } + return post; + } + + private: + IRModule module_; +}; + +IRModule PreprocessExternalFuncIO_(IRModule module) { Review comment: Done -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected]. For queries about this service, please contact Infrastructure at: [email protected]
