guberti commented on code in PR #13242:
URL: https://github.com/apache/tvm/pull/13242#discussion_r1037148586
##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -14,142 +14,334 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
+This function can be used to tensorize many common operators including regular
conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. When for regular
convolution, use data
+layout HHWC and kernel layout OHWI. For depthwise convolution, use data layout
data layout is NCHW
+and kernel layout OIHW.
+"""
+
+from collections import namedtuple
+from itertools import chain
import textwrap
+from typing import Iterator, Tuple
-from tvm import te, tir
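+# Holds one SMLA multiply-accumulate call: the instruction mnemonic, plus the tensor and kernel
+# variables it reads as operands.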
+SMLAInstruction = namedtuple("SMLAInstruction", ["instruction", "tensor_var",
"kernel_var"])
-from .common import num_simd_lanes_per_word
+def _get_c_function_name(split_size, dimensions, offsets, x_strides):
+ """Generates a C function name for tensordot.
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
- """Gets the C function name of the tensordot function."""
- return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    We do not need a suffix, as the generated function will have an #include guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{split_size}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if split_size > 1 else "")
+    )
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic differently) but we can
-    build part here to simplify the code."""
+def _init_biased_accumulators(split_size):
+ """Generates code to load the bias into the accumulators.
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+    Addition is commutative, so we could add the bias before, during, or after performing our
+    multiply-accumulate operations. Where we add the bias does not change the overflow behavior.
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
-    )
+    Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register,
+    so we'll do it first.
+    """
+    assignments = map(lambda x: f"sum_{x:x} = *bias", range(split_size))
+    joined_assignments = ", ".join(assignments)
+    return f"int {joined_assignments};"
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _get_tensor_halfwords(dimensions, offset, split_size, in_stride) -> Iterator:
Review Comment:
Originally, I did some subsequent processing before calling `list`, but I
agree that constructing the list in `_get_tensor_halfwords` is the clearest
approach now.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]