guberti commented on code in PR #13242:
URL: https://github.com/apache/tvm/pull/13242#discussion_r1040228720
##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
+This function can be used to tensorize many common operators including regular
conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. For regular
convolution, use data
+layout NHWC and kernel layout OHWI. For depthwise convolution, use data layout
NCHW
+and kernel layout OIHW.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
import textwrap
+from typing import Iterator, Optional, Tuple
-from tvm import te, tir
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+ """Class for keeping track of one SMLA instruction and its tensor and kernel operands."""
+ instruction: str
+ tensor_var: str
+ kernel_var: str
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
- """Gets the C function name of the tensordot function."""
- return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+ def call_with_acle(self, accumulator_var: str) -> str:
+ return (
+ f"{accumulator_var} = __{self.instruction}"
+ f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+ )
+ def has_same_operands(self, other: "SMLAInstruction") -> bool:
+ return self.tensor_var == other.tensor_var and self.kernel_var ==
other.kernel_var
-def make_intrin_tensordot(slices, strides, tensordot_params):
- """Helper function for constructing tensordot intrinsic. We can't
construct the whole thing here
- (as multiple schedules use tensordot and each must build the intrinstic
differently) but we can
- build part here to simplify the code."""
- # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
- data, kernel, output = slices
- data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+ """Generates a C function name for tensordot.
- data_buf = tir.decl_buffer(
- data.shape, data.dtype, name="data", offset_factor=1,
strides=data_strides
- )
- kernel_buf = tir.decl_buffer(
- kernel.shape,
- kernel.dtype,
- name="kernel",
- offset_factor=1,
- strides=kernel_strides,
- )
- output_buf = tir.decl_buffer(
- output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+ We do not need a suffix, as the generated function will have an #include
guard. Unlike other
+ microTVM operators, _get_c_function_name is never called externally.
+ """
+ tensor_w, kernel_h, kernel_w = dimensions
+ return (
+ f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+ + f"{kernel_h}x{kernel_w}_"
+ + "".join(map(str, offsets))
+ + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
)
- def intrin_func(ins, outs):
- builder = tir.ir_builder.create()
- builder.emit(
- tir.call_extern(
- "int32",
- _get_func_name(*tensordot_params),
- outs[0].access_ptr("w"),
- ins[0].access_ptr("r"),
- ins[1].access_ptr("r"),
- )
- )
- return builder.get()
- return te.decl_tensor_intrin(
- output.op,
- intrin_func,
- binds={data: data_buf, kernel: kernel_buf, output: output_buf},
- )
+def _init_biased_accumulators(num_outputs):
+ """Generates code to load the bias into the accumulators.
+
+ Addition is commutative, so we could add the bias before, during, or after
performing our
+ multiply-accumulate operations. Where we add the bias does not change the
overflow behavior.
+
+ Doing the bias add takes one cycle either way (if done at the beginning we
can't use a SMULXY
+ trick to set sum_i to zero for "free"). However, doing it at the beginning
frees up a register,
+ so we'll do it first.
+ """
+ assignments = map(lambda x: f"sum_{x:x} = *bias", range(num_outputs))
+ joined_assignments = ", ".join(assignments)
+ return f"int32_t {joined_assignments};"
+
+
+def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) ->
Iterator[Optional[Tuple]]:
+ """Gets the data that will be stored in memory at the tensor pointer.
+
+ Returns an Iterator of Optional[Tuple], while skipping over word-aligned
pairs of unrelated
+ halfwords. The returned iterator is as short as possible while having even
length and containing
+ all relevant tensor data. Tuples in the returned Iterator represent a (y,
x) offset from the
+ top-left tensor position being used in this convolution. We need to be
aware of the None values
+ so our code is correctly word-aligned.
+
+ One consequence of these requirements - each row in the tensor is broken
into word-aligned pairs
+ of halfwords (which are later combined into full words). See the examples
below:
+
+ A simple 3x3 depthwise convolution computing one output and with in_stride
= 1. Note that each
+ row is padded with None at the end to make the rows word-aligned.
+ >>> _get_tensor_halfwords((48, 3, 3), 0, 1, 1) # doctest:
+NORMALIZE_WHITESPACE
Review Comment:
Agreed - I've moved them to `test_topi_conv2d_tensordot_opts`.
##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,416 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
Review Comment:
Sometimes - this uses the DSP instructions, which are required in v7E-M but
optional in v8-M. This code also does not use MVE, which is optional in v8-M
but would be really useful for deep learning. I've clarified this in the
docstring.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]