[GitHub] [tvm] areusch commented on a diff in pull request #13242: [microTVM] Modernize Arm Cortex-M convolution schedules

GitBox Mon, 05 Dec 2022 13:28:50 -0800


areusch commented on code in PR #13242:
URL: https://github.com/apache/tvm/pull/13242#discussion_r1040102872



##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.

Review Comment:
   does this apply to v8-M also?



##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
 
+This function can be used to tensorize many common operators including regular 
conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. When for regular 
convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout 
data layout is NCHW
+and kernel layout OIHW.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
 import textwrap
+from typing import Iterator, Optional, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+    """Class for keeping track of an item in inventory."""
 
+    instruction: str
+    tensor_var: str
+    kernel_var: str
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    def call_with_acle(self, accumulator_var: str) -> str:
+        return (
+            f"{accumulator_var} = __{self.instruction}"
+            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+        )
 
+    def has_same_operands(self, other: "SMLAInstruction") -> bool:
+        return self.tensor_var == other.tensor_var and self.kernel_var == 
other.kernel_var
 
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't 
construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic 
differently) but we can
-    build part here to simplify the code."""
 
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+    """Generates a C function name for tensordot.
 
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, 
strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    We do not need a suffix, as the generated function will have an #include 
guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
     )
 
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
 
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _init_biased_accumulators(num_outputs):
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after 
performing our
+    multiply-accumulate operations. Where we add the bias does not change the 
overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we 
can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning 
frees up a register,
+    so we'll do it first.
+    """
+    assignments = map(lambda x: f"sum_{x:x} = *bias", range(num_outputs))

Review Comment:
   more pythonic:
   ```suggestion
       assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)]
   ```



##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
 
+This function can be used to tensorize many common operators including regular 
conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. When for regular 
convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout 
data layout is NCHW
+and kernel layout OIHW.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
 import textwrap
+from typing import Iterator, Optional, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+    """Class for keeping track of an item in inventory."""
 
+    instruction: str
+    tensor_var: str
+    kernel_var: str
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    def call_with_acle(self, accumulator_var: str) -> str:
+        return (
+            f"{accumulator_var} = __{self.instruction}"
+            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+        )
 
+    def has_same_operands(self, other: "SMLAInstruction") -> bool:
+        return self.tensor_var == other.tensor_var and self.kernel_var == 
other.kernel_var
 
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't 
construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic 
differently) but we can
-    build part here to simplify the code."""
 
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+    """Generates a C function name for tensordot.
 
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, 
strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    We do not need a suffix, as the generated function will have an #include 
guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
     )
 
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
 
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _init_biased_accumulators(num_outputs):
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after 
performing our
+    multiply-accumulate operations. Where we add the bias does not change the 
overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we 
can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning 
frees up a register,
+    so we'll do it first.
+    """
+    assignments = map(lambda x: f"sum_{x:x} = *bias", range(num_outputs))
+    joined_assignments = ", ".join(assignments)
+    return f"int32_t {joined_assignments};"
+
+
+def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> 
Iterator[Optional[Tuple]]:
+    """Gets the data that will be stored in memory at the tensor pointer.

Review Comment:
   I think you mean "gets the logical indices of the data that will be stored 
in memory at the given pointer", maybe? I'm not sure this returns the actual 
data, right?



##########
python/tvm/topi/arm_cpu/qnn.py:
##########
@@ -0,0 +1,369 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Contains TVMScript implementations of some QNN operators for Arm.
+
+Currently, the only ops with compute functions are fused regular and depthwise 
convolutions for
+Arm Cortex-M with DSP.
+"""
+
+from typing import Tuple
+
+import tvm
+from tvm import te
+from tvm.tir import const
+from tvm.script import tir as T
+from ..utils import get_const_tuple
+from .mprofile.dsp.micro_kernel import tensordot
+
+
+def int_ceil_division(x, y):
+    return -(x // -y)
+
+
+def _compute_output_dim(data_length, kernel_length, stride):
+    return int_ceil_division(data_length + 1 - kernel_length, stride)
+
+
+def _pick_tensordot_impl(attrs, inputs, num_sums=2, is_depthwise=False):
+    """Helper function that chooses the right implementation of 
micro_kernel.tensordot.
+
+    Takes as input the parameters of the conv2d, and returns a tuple of TWO 
(function_name,
+    function_code). The first pair (the aligned one) is for even numbered 
output channels, and the
+    second pair (the offset one) is for odd-numbered output channels. This 
function is used for
+    regular and depthwise convolutions.
+
+    We need different implementations for even vs odd numbered output 
channels, because the "start"
+    of an odd output channel in the data tensor or kernel might or might not 
be on a word boundary,
+    and the tensordot code expects all input pointers to be word-aligned.
+    """
+    data, kernel = inputs[0:2]
+    rq_output_zero_point_const = inputs[10]
+    assert len(rq_output_zero_point_const.op.body) == 1
+    output_zero_point = rq_output_zero_point_const.op.body[0]
+
+    _, stride_w = get_const_tuple(attrs.strides)
+
+    if is_depthwise:
+        assert attrs.data_layout == "NCHW"
+        assert attrs.kernel_layout == "IOHW"
+        _, _, height, width = get_const_tuple(data.shape)
+        _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
+
+        dimensions = (width, kernel_h, kernel_w)
+        in_stride = stride_w
+        data_per_oc_size = height * width
+    else:
+        assert attrs.data_layout == "NHWC"
+        assert attrs.kernel_layout == "OHWI"
+        _, height, width, in_channels = get_const_tuple(data.shape)
+        out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+
+        dimensions = (width * in_channels, kernel_h, kernel_w * in_channels)
+        in_stride = in_channels * stride_w
+        data_per_oc_size = 0
+
+    assert attrs.out_layout is not None
+    if attrs.out_layout == "NHWC":
+        out_stride = out_channels
+    elif attrs.out_layout == "NCHW":
+        out_stride = 1
+    else:
+        raise ValueError(f"Unsupported output layout {attrs.out_layout}!")
+
+    x_strides = (in_stride, out_stride)
+    aligned_func = tensordot.tensordot_int16_impl(
+        num_sums,
+        dimensions,
+        (0, 0, 0),
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    kernel_per_oc_size = dimensions[1] * dimensions[2]
+
+    offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0)
+    offset_func = tensordot.tensordot_int16_impl(
+        num_sums,
+        dimensions,
+        offsets,
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    return (aligned_func, offset_func)
+
+
+def _make_tscript_ptr(buffer, offset, length, dtype="int16"):
+    return T.tvm_access_ptr(
+        T.type_annotation(dtype=dtype),
+        buffer.data,
+        offset,
+        length,
+        1,
+        dtype="handle",
+    )
+
+
+def _make_tscript_call(func_name, *args):
+    return T.evaluate(T.call_extern(func_name, *args, dtype="int32"))
+
+
+def _make_conv2d_primfunc(
+    call_dimensions: Tuple,
+    buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple],
+    aligned_func: Tuple[str, str],
+    offset_func: Tuple[str, str],
+    ptr_gens: Tuple,
+):
+    height, width, out_channels = call_dimensions
+    data_shape, kernel_shape, bias_shape, scale_shape, output_shape = 
buffer_shapes
+    aligned_func_name, aligned_func_code = aligned_func
+    offset_func_name, offset_func_code = offset_func
+    output_ptr, data_ptr, kernel_ptr = ptr_gens
+
+    # If the functions are identical, we can skip the second loop
+    if aligned_func_name == offset_func_name:
+        aligned_channels = out_channels
+        offset_channels = tvm.tir.const(0)
+        c_step = tvm.tir.const(1)
+    else:
+        aligned_channels = out_channels // 2
+        offset_channels = out_channels // 2
+        c_step = tvm.tir.const(2)
+
+    def bias_ptr(bias, c):
+        return _make_tscript_ptr(bias, c, 1, dtype="int32")
+
+    def scale_ptr(scale, c):
+        return _make_tscript_ptr(scale, c, 1, dtype="int32")
+
+    @T.prim_func
+    def biased_quantized_conv2d(
+        data_handle: T.handle,
+        kernel_handle: T.handle,
+        bias_handle: T.handle,
+        scale_handle: T.handle,
+        output_handle: T.handle,
+    ) -> None:
+
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        data = T.match_buffer(data_handle, data_shape, dtype="int16")
+        kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
+        bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
+
+        # We don't specify a data type for the requantization scale, even 
though we will read it as
+        # an int32. This is because we must pretend it is a float32, as 
Relay's requantize op only
+        # allows floating point scales.
+        scale = T.match_buffer(scale_handle, scale_shape)
+        output = T.match_buffer(output_handle, output_shape, dtype="int16")
+
+        # This hack prevents TVM from seeing these variables as "unused". I 
should be using T.reads
+        # and T.writes, but they don't work. I think it's an issue with 
BufferTouchedDomain.
+        # pylint: disable=unused-variable
+        output[0, 0, 0, 0] = 0
+        __1 = data[0, 0, 0, 0]
+        __2 = kernel[0, 0, 0, 0]
+        __3 = bias[0, 0, 0, 0]
+        __4 = scale[0]
+        # pylint: enable=unused-variable
+
+        for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width):
+            with T.block("conv2d_aligned"):
+                T.block_attr({"pragma_import_c": aligned_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    aligned_func_name,
+                    output_ptr(output, y, x, c * c_step),
+                    data_ptr(data, y, x, c * c_step),
+                    kernel_ptr(kernel, c * c_step),
+                    bias_ptr(bias, c * c_step),
+                    scale_ptr(scale, c * c_step),
+                )
+
+        for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width):
+            with T.block("conv2d_offset"):
+                T.block_attr({"pragma_import_c": offset_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    offset_func_name,
+                    output_ptr(output, y, x, c * c_step + 1),
+                    data_ptr(data, y, x, c * c_step + 1, offset=1),
+                    kernel_ptr(kernel, c * c_step + 1, offset=1),
+                    bias_ptr(bias, c * c_step + 1),
+                    scale_ptr(scale, c * c_step + 1),
+                )
+
+    return biased_quantized_conv2d
+
+
+def qnn_conv2d(attrs, inputs, out_type):
+    """Compute for qnn.conv2d with NHWC layout.
+
+    Note that this is a DIFFERENT layout from the Hexagon variant, because 
they have special
+    instructions Cortex-M doesn't have. We expect the kernel to have OHWI 
layout. We also assume
+    that padding is not necessary, as it will have been done by another pass.
+    """
+
+    # Make a few checks to unpack the function arguments and ensure it was 
called with the right
+    # arguments. Note that unlike most schedules, qnn_conv2d does not use a 
wrapper.
+    assert len(inputs) == 11
+    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
+    output_layout = attrs.out_layout
+    assert output_layout == "NHWC"
+
+    _, height, width, in_channels = get_const_tuple(data.shape)
+    out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+    y_stride, x_stride = get_const_tuple(attrs.strides)
+
+    out_height = _compute_output_dim(height, kernel_h, y_stride)
+    out_width = _compute_output_dim(width, kernel_w, x_stride)
+
+    # Decide how many sums our function should have running at the same time. 
Doing
+    # this lets us do "more work" for each memory load, but doing too many of 
them causes us to run
+    # out of registers. Currently this is set to either 1 or 2, but autotuning 
this value would

Review Comment:
   can you reference it in the comments?



##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
 
+This function can be used to tensorize many common operators including regular 
conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. When for regular 
convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout 
data layout is NCHW
+and kernel layout OIHW.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
 import textwrap
+from typing import Iterator, Optional, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+    """Class for keeping track of an item in inventory."""
 
+    instruction: str
+    tensor_var: str
+    kernel_var: str
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    def call_with_acle(self, accumulator_var: str) -> str:
+        return (
+            f"{accumulator_var} = __{self.instruction}"
+            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+        )
 
+    def has_same_operands(self, other: "SMLAInstruction") -> bool:
+        return self.tensor_var == other.tensor_var and self.kernel_var == 
other.kernel_var
 
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't 
construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic 
differently) but we can
-    build part here to simplify the code."""
 
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+    """Generates a C function name for tensordot.
 
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, 
strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    We do not need a suffix, as the generated function will have an #include 
guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
     )
 
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
 
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _init_biased_accumulators(num_outputs):
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after 
performing our
+    multiply-accumulate operations. Where we add the bias does not change the 
overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we 
can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning 
frees up a register,
+    so we'll do it first.
+    """
+    assignments = map(lambda x: f"sum_{x:x} = *bias", range(num_outputs))
+    joined_assignments = ", ".join(assignments)
+    return f"int32_t {joined_assignments};"
+
+
+def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> 
Iterator[Optional[Tuple]]:
+    """Gets the data that will be stored in memory at the tensor pointer.
+
+    Returns an Iterator of Optional[Tuple], while skipping over word-aligned 
pairs of unrelated
+    halfwords. The returned iterator is as short as possible while having even 
length and containing
+    all relevant tensor data. Tuples in the returned Iterator represent an (y, 
x) offset from the
+    top-left tensor position being used in this convolution. We need to be 
aware of the None values
+    so our code is correctly word-aligned.
+
+    One consequence of these requirements - each row in the tensor is broken 
into word-aligned pairs
+    of halfwords (which are later combined into full words). See the examples 
below:
+
+    A simple 3x3 depthwise convolution computing one output and with in_stride 
= 1. Note that each
+    row is padded with None at the end to make the rows word-aligned.
+        >>> _get_tensor_halfwords((48, 3, 3), 0, 1, 1)  # doctest: 
+NORMALIZE_WHITESPACE

Review Comment:
   we should definitely enable this plugin, but since it's not enabled, can you 
write these as unittests for now? i do want to make sure they pass before we 
submit this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [tvm] areusch commented on a diff in pull request #13242: [microTVM] Modernize Arm Cortex-M convolution schedules

Reply via email to