mkatanbaf commented on code in PR #13242: URL: https://github.com/apache/tvm/pull/13242#discussion_r1028398803
########## python/tvm/relay/qnn/strategy/arm_cpu.py: ########## @@ -0,0 +1,63 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Quantized operator strategy for Arm CPU. + +As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current +schedules only work for fused operators with bias, as this is the most common use case. Only +regular/depthwise conv2d is supported, but qnn_dense will be added eventually.""" + +from tvm import topi, TVMError +from .generic import qnn_conv2d_strategy +from ... import op as _op +from ...op.strategy.generic import is_depthwise_conv2d + + +@qnn_conv2d_strategy.register("arm_cpu") +def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target): + """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.""" + + if not (target.features.has_dsp and "cortex-m" in target.mcpu): + raise TVMError( + "Quantized Arm schedules only exist for Cortex-M with DSP! " + "The qnn.Legalize pass should be run for other Arm processors." 
+ ) + + data = inputs[0] + kernel = inputs[1] + data_layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + groups = attrs.groups + strategy = _op.OpStrategy() + + if groups == 1: + if data_layout == "NHWC" and kernel_layout == "OHWI": Review Comment: I remember you wrote a nice explanation why the NHWC and NCHW layouts result in better performance for conv2d and depthwise_conv2d respectively. It would be nice to add a summary of the discussion or a link to it here. ########## python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py: ########## @@ -14,142 +14,333 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators -including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts -are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When -this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW.""" +"""Generates optimized code to compute a tensor dot product on ARMv7E-M. +This function can be used to tensorize many common operators including regular conv2d, depthwise +conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data +layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW +and kernel layout OIHW. +""" + +from itertools import chain import textwrap +from typing import Iterator, Tuple -from tvm import te, tir -from .common import num_simd_lanes_per_word +def _get_c_function_name(split_size, dimensions, offsets, x_strides): + """Generates a C function name for tensordot. + We do not need a suffix, as the generated function will have an #include guard. Unlike other + microTVM operators, _get_c_function_name is never called externally. 
+ """ + tensor_w, kernel_h, kernel_w = dimensions + return ( + f"tensordot_opt_x{split_size}_int16_w{tensor_w}_" + + f"{kernel_h}x{kernel_w}_" + + "".join(map(str, offsets)) + + (f"_{x_strides[0]}_{x_strides[1]}" if split_size > 1 else "") + ) -def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix): - """Gets the C function name of the tensordot function.""" - return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}" +def _init_biased_accumulators(split_size): + """Generates code to load the bias into the accumulators. -def make_intrin_tensordot(slices, strides, tensordot_params): - """Helper function for constructing tensordot intrinsic. We can't construct the whole thing here - (as multiple schedules use tensordot and each must build the intrinstic differently) but we can - build part here to simplify the code.""" + Addition is commutative, so we could add the bias before, during, or after performing our + multiply-accumulate operations. Where we add the bias does not change the overflow behavior. - # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params - data, kernel, output = slices - data_strides, kernel_strides = strides + Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY + trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register, + so we'll do it first. 
+ """ + assignments = map(lambda x: f"sum_{x:x} = *bias", range(split_size)) + joined_assignments = ", ".join(assignments) + return f"int {joined_assignments};" - data_buf = tir.decl_buffer( - data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides - ) - kernel_buf = tir.decl_buffer( - kernel.shape, - kernel.dtype, - name="kernel", - offset_factor=1, - strides=kernel_strides, - ) - output_buf = tir.decl_buffer( - output.shape, output.dtype, name="output", offset_factor=1, strides=[1] - ) - def intrin_func(ins, outs): - builder = tir.ir_builder.create() - builder.emit( - tir.call_extern( - "int32", - _get_func_name(*tensordot_params), - outs[0].access_ptr("w"), - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - ) - ) - return builder.get() +def _get_tensor_halfwords(dimensions, offset, split_size, in_stride) -> Iterator: + tensor_w, kernel_h, kernel_w = dimensions - return te.decl_tensor_intrin( - output.op, - intrin_func, - binds={data: data_buf, kernel: kernel_buf, output: output_buf}, - ) + split_max = (split_size - 1) * in_stride + for y in range(kernel_h): + if y * tensor_w % 2 + offset == 1: + yield None + for x in range(kernel_w + split_max): + yield (y, x) + if (y * tensor_w + kernel_w + split_max + offset) % 2 == 1: + yield None + + +def _get_kernel_halfwords(dimensions, offset) -> Iterator: + _, kernel_h, kernel_w = dimensions + if offset == 1: + yield None + for y in range(kernel_h): + for x in range(kernel_w): + yield (y, x) + if (kernel_h * kernel_w + offset) % 2 == 1: + yield None + + +def _get_int16_alias(position) -> str: + if not position: + return "unknown" + y, x = position + return f"y{y:0>2x}_x{x:0>2x}" + + +def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + offset = int(not bool(halfwords[0])) + + for i in range(0, len(halfwords), 2): + var_name = "__".join(map(_get_int16_alias, halfwords[i : i + 2])) + y, x = halfwords[i + 1] or halfwords[i] + tensor_index = (y * tensor_w + x + 
offset) // 2 + yield f"int tensor__{var_name} = tensor[{tensor_index}];" + + +def _load_kernel_vars(halfwords) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + for i in range(0, len(halfwords), 2): + var_name = "__".join(map(_get_int16_alias, halfwords[i : i + 2])) + yield f"int kernel__{var_name} = kernel[{i // 2}];" -def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str: - """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has - a `jump` argument that advances the pointer of one tensor by that many words after each row. The - `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned - memory access is slow on the Cortex-M series. Depending on the input datatype, the code may - contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See - the below pseudocode for reference: - - tensordot(out_ptr, dat_ptr, ker_ptr) { - sum = 0; - for (i = 0; i < tensor_h; i++) { - for (j = 0; j < tensor_w; j++) { - sum += (*dat_ptr++) * (*ker_ptr++); - } - dat_ptr += jump; - } - *out_ptr = sum; - } +def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]: + """Generates an un-optimized list of multiply-accumulate instructions. + + We will optimize these into SIMD instructions later. The tuples in the returned iterator are + organized as: + + (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x)) + Review Comment: Could you add an example here, please? Maybe you can use a namedtuple to make it more readable? ########## tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py: ########## @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU use case. There have +been some accuracy issues in the past, so this test steps through a model (MobileNetV1) layer by +layer and ensures there is 1-1 correspondance at each step. This test would run way faster if we ran +the model all at once, but then we wouldn't know which layers had issues. + +Furthermore, this test uses some in-development optimizations for microTVM that aren't part of the +main pipeline. +""" + +import numpy as np +from PIL import Image +import pytest + +import tvm +import tvm.testing +from tvm import meta_schedule, relay +from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel +from tvm.relay.backend import Executor, Runtime +from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER +from tvm.contrib.download import download_testdata +from test_generalized_conv2d import change_ndarray_layout + + +# The model is the v0.7 version of the TinyML person detection (aka visual wake words) model. This +# is an RGB 96x96 MobileNet V1 model. 
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite" +SAMPLE_URL = ( + "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg" +) + + +@pytest.fixture(scope="module") +def interpreter(): + """Returns a TFLite interpreter with the MLPerf Tiny visual wakewords model loaded, with an + elephant image run through it, and with all intermediate layer outputs saved.""" + + # Make sure the Tensorflow import is skipped if the test is being skipped. This is needed to + # prevent the "python: i386" tests from failing, as they don't have Tensorflow installed. + import tensorflow as tf # pylint: disable=import-outside-toplevel + + # Download the reference model + rel_model_path = "model_microtvm_mobilenetv1.tflite" + file = download_testdata(MODEL_URL, rel_model_path, overwrite=False) + + # Load it into TensorFlow and allocate memory + interpreter = tf.lite.Interpreter(file, experimental_preserve_all_tensors=True) + interpreter.allocate_tensors() + + # Download an image. The neuron activations are strange if we use random data or ones, + # so downloading an image is useful. + rel_image_path = "image_microtvm_mobilenetv1.jpg" + img_path = download_testdata(SAMPLE_URL, rel_image_path, overwrite=False) + image = Image.open(img_path).resize((96, 96)) + image_data_hwc_uint8 = np.asarray(image) + assert image_data_hwc_uint8.shape == (96, 96, 3) + assert image_data_hwc_uint8.dtype == "uint8" + image_data_nhwc_int8 = (image_data_hwc_uint8 + 128).view("int8").reshape((1, 96, 96, 3)) + + # Load the image into the TFLite interpreter and compute all intermediate tensor values + input_details = interpreter.get_input_details() + interpreter.set_tensor(input_details[0]["index"], image_data_nhwc_int8) + interpreter.invoke() + return interpreter + + +def _get_mobilenet_v1_layer_attributes(layer_num): + """Returns the relevant padding and stride for a given layer in a MobileNetV1 model. 
It's a huge + headache to read this data from TensorFlow, as it is not user accessible via the interpreter. If + we really wanted to, we would have to parse the .tflite file ourselves. This function is a bit + of a hack, but lets us skip that.""" + + if layer_num == 0: # Regular conv2d Review Comment: There are small avg_pool and softmax layers at the end of the vww model; it would be nice to clarify that those layers are not considered here. ########## python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py: ########## @@ -14,142 +14,333 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Computes a "jumpy tensordot" operator, which can be used to tensorize many common operators -including regular conv2d, depthwise conv2d, and grouped conv2d provided the data and kernel layouts -are the optimal ones. When groups=1, the optimal data layout is NHWC and kernel layout is OHWI. When -this is a depthwise convolution, the optimal data layout is NCHW and kernel layout is OIHW.""" +"""Generates optimized code to compute a tensor dot product on ARMv7E-M. +This function can be used to tensorize many common operators including regular conv2d, depthwise +conv2d, and grouped conv2d for some data and kernel layouts. When for regular convolution, use data +layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout data layout is NCHW +and kernel layout OIHW. +""" + +from itertools import chain import textwrap +from typing import Iterator, Tuple -from tvm import te, tir -from .common import num_simd_lanes_per_word +def _get_c_function_name(split_size, dimensions, offsets, x_strides): + """Generates a C function name for tensordot. + We do not need a suffix, as the generated function will have an #include guard. Unlike other + microTVM operators, _get_c_function_name is never called externally. 
+ """ + tensor_w, kernel_h, kernel_w = dimensions + return ( + f"tensordot_opt_x{split_size}_int16_w{tensor_w}_" + + f"{kernel_h}x{kernel_w}_" + + "".join(map(str, offsets)) + + (f"_{x_strides[0]}_{x_strides[1]}" if split_size > 1 else "") + ) -def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix): - """Gets the C function name of the tensordot function.""" - return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}" +def _init_biased_accumulators(split_size): + """Generates code to load the bias into the accumulators. -def make_intrin_tensordot(slices, strides, tensordot_params): - """Helper function for constructing tensordot intrinsic. We can't construct the whole thing here - (as multiple schedules use tensordot and each must build the intrinstic differently) but we can - build part here to simplify the code.""" + Addition is commutative, so we could add the bias before, during, or after performing our + multiply-accumulate operations. Where we add the bias does not change the overflow behavior. - # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params - data, kernel, output = slices - data_strides, kernel_strides = strides + Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY + trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register, + so we'll do it first. 
+ """ + assignments = map(lambda x: f"sum_{x:x} = *bias", range(split_size)) + joined_assignments = ", ".join(assignments) + return f"int {joined_assignments};" - data_buf = tir.decl_buffer( - data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides - ) - kernel_buf = tir.decl_buffer( - kernel.shape, - kernel.dtype, - name="kernel", - offset_factor=1, - strides=kernel_strides, - ) - output_buf = tir.decl_buffer( - output.shape, output.dtype, name="output", offset_factor=1, strides=[1] - ) - def intrin_func(ins, outs): - builder = tir.ir_builder.create() - builder.emit( - tir.call_extern( - "int32", - _get_func_name(*tensordot_params), - outs[0].access_ptr("w"), - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - ) - ) - return builder.get() +def _get_tensor_halfwords(dimensions, offset, split_size, in_stride) -> Iterator: + tensor_w, kernel_h, kernel_w = dimensions - return te.decl_tensor_intrin( - output.op, - intrin_func, - binds={data: data_buf, kernel: kernel_buf, output: output_buf}, - ) + split_max = (split_size - 1) * in_stride + for y in range(kernel_h): + if y * tensor_w % 2 + offset == 1: + yield None + for x in range(kernel_w + split_max): + yield (y, x) + if (y * tensor_w + kernel_w + split_max + offset) % 2 == 1: + yield None + + +def _get_kernel_halfwords(dimensions, offset) -> Iterator: + _, kernel_h, kernel_w = dimensions + if offset == 1: + yield None + for y in range(kernel_h): + for x in range(kernel_w): + yield (y, x) + if (kernel_h * kernel_w + offset) % 2 == 1: + yield None + + +def _get_int16_alias(position) -> str: + if not position: + return "unknown" + y, x = position + return f"y{y:0>2x}_x{x:0>2x}" + + +def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + offset = int(not bool(halfwords[0])) + + for i in range(0, len(halfwords), 2): + var_name = "__".join(map(_get_int16_alias, halfwords[i : i + 2])) + y, x = halfwords[i + 1] or halfwords[i] + tensor_index = (y * tensor_w + x + 
offset) // 2 + yield f"int tensor__{var_name} = tensor[{tensor_index}];" + + +def _load_kernel_vars(halfwords) -> Iterator[str]: + assert len(halfwords) % 2 == 0 + for i in range(0, len(halfwords), 2): + var_name = "__".join(map(_get_int16_alias, halfwords[i : i + 2])) + yield f"int kernel__{var_name} = kernel[{i // 2}];" -def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str: - """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has - a `jump` argument that advances the pointer of one tensor by that many words after each row. The - `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned - memory access is slow on the Cortex-M series. Depending on the input datatype, the code may - contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See - the below pseudocode for reference: - - tensordot(out_ptr, dat_ptr, ker_ptr) { - sum = 0; - for (i = 0; i < tensor_h; i++) { - for (j = 0; j < tensor_w; j++) { - sum += (*dat_ptr++) * (*ker_ptr++); - } - dat_ptr += jump; - } - *out_ptr = sum; - } +def _get_draft_macs(kernel_dims, tensor_halfwords, kernel_halfwords, offset) -> Iterator[Tuple]: + """Generates an un-optimized list of multiply-accumulate instructions. + + We will optimize these into SIMD instructions later. The tuples in the returned iterator are + organized as: + + (instruction, (arg1_y, arg1_x), (arg2_y, arg2_x)) + + We return an iterator so that optimizations may be done by iterator chaining. 
+ """ + + def get_var(y, x, halfwords): + i = halfwords.index((y, x)) + if i % 2 == 0: + return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i + 1])}", "b" + return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, x))}", "t" + + kernel_h, kernel_w = kernel_dims + for y in range(kernel_h): + for x in range(kernel_w): + tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords) + kernel_var, kernel_half = get_var(y, x, kernel_halfwords) + instruction = f"smla{tensor_half}{kernel_half}" + yield instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}" + + +def _apply_simd_optimizations(instruction_tuples) -> Iterator[Tuple]: + """When possible, fuses single MACs into SIMD MAC instructions. + + The compiler cannot do this automatically, as calling __builtin_arm_smlaxy forces the SMLAxy Review Comment: I'm not sure if I understand this correctly, but does this mean that we will unroll the loop and get a long list of instructions instead? would this significantly increase the code size? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
