This is an automated email from the ASF dual-hosted git repository.

guberti pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new bbba8d97fe [microTVM] Modernize Arm Cortex-M convolution schedules (#13242)
bbba8d97fe is described below

commit bbba8d97fe9c32513f6143b54ea66ce8277b79d1
Author: Gavin Uberti <[email protected]>
AuthorDate: Wed Dec 7 01:27:57 2022 +0900

    [microTVM] Modernize Arm Cortex-M convolution schedules (#13242)
    
    * Quantized Corstone300 test draft
    
    * Add QNN strategy with operator fusion for Cortex-M
    
    Get QNN strategy running
    
    QNN strategy with operator fusion
    
    * Add assembly tensordot code from other PR
    
    Assembly tensordot from other PR
    
    Tensordot offset support
    
    Hand tested tensordot code
    
    * Helper work to support microTVM TIR schedules
    
    Formatting fixes
    
    Don't use automatic AOT building when skipping pass
    
    Assorted tech for scheduling with TIR
    
    Hacky int16 support
    
    * TIR schedule for microTVM conv2d
    
    Bugged schedule implementation
    
    Passing test!
    
    Works for all 1x1 conv2ds!
    
    External QNN operator altering
    
    Debugging work
    
    Pad with correct constant
    
    Broadly functional conv2d
    
    Reorganize quantize convolution test
    
    * TIR schedule for microTVM depthwise_conv2d
    
    Working depthwise convolution for strides=1
    
    Working depthwise convolution!
    
    * Clean up code
    
    Support Python 3.7
    
    Clean up code to prepare for review
    
    * Break qnn.py into helper functions
    
    * Finish reorganizing qnn.py
    
    * Fix linting
    
    * Remove residual debug code and fix linting
    
    * Try repairing unit tests
    
    * Run black to fix linting
    
    * Address code review comments
    
    * Second round of code review
    
    Second round of code review
    
    Fix tensordot opts test
    
    * Address @areusch code review
    
    * More code review
    
    * Catch VWW model download with request hook
---
 python/tvm/relay/op/nn/_nn.py                      |  17 +
 python/tvm/relay/qnn/strategy/__init__.py          |   1 +
 python/tvm/relay/qnn/strategy/arm_cpu.py           |  72 ++++
 python/tvm/topi/arm_cpu/__init__.py                |   2 +
 python/tvm/topi/arm_cpu/conv2d.py                  |  18 -
 python/tvm/topi/arm_cpu/depthwise_conv2d.py        |  20 -
 .../arm_cpu/mprofile/dsp/micro_kernel/tensordot.py | 469 ++++++++++++++++-----
 .../topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py | 296 -------------
 python/tvm/topi/arm_cpu/qnn.py                     | 370 ++++++++++++++++
 python/tvm/topi/arm_cpu/qnn_alter_op.py            | 122 ++++++
 python/tvm/topi/nn/qnn.py                          |  48 +++
 src/relay/qnn/op/convolution.cc                    |   5 +-
 .../test_ethosn/test_convert_equivalents.py        |   4 +-
 tests/python/relay/strategy/arm_cpu/test_conv2d.py |  22 -
 .../strategy/arm_cpu/test_depthwise_conv2d.py      |  31 --
 .../strategy/arm_cpu/test_generalized_conv2d.py    |  10 +-
 .../strategy/arm_cpu/test_quantized_convolution.py | 358 ++++++++++++++++
 .../topi/python/test_topi_conv2d_tensordot_opts.py | 415 ++++++++++++++++++
 tests/scripts/request_hook/request_hook.py         |   1 +
 19 files changed, 1775 insertions(+), 506 deletions(-)

diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index 53aec11e58..e956c82828 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -877,6 +877,23 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
     return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
 
 
+# QNN ops
+@reg.register_alter_op_layout("add")
+def alter_op_layout_add(attrs, inputs, tinfos, out_type):
+    """Alter the layout of a add op.
+
+    Useful for fusing the bias constant with an input zero point constant in a previous quantized
+    op. Only used when previous op is a quantized op, which is why it lives in topi.nn.qnn.
+    """
+    return topi.nn.qnn.qnn_add_alter_layout(attrs, inputs, tinfos, out_type)
+
+
+@reg.register_alter_op_layout("qnn.requantize")
+def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
+    """Alter the layout of a requantization op."""
+    return topi.nn.qnn.qnn_requantize_alter_layout(attrs, inputs, tinfos, out_type)
+
+
 # bitpack
 @reg.register_compute("nn.bitpack")
 def compute_bitpack(attrs, inputs, out_dtype):
diff --git a/python/tvm/relay/qnn/strategy/__init__.py b/python/tvm/relay/qnn/strategy/__init__.py
index 05778c3e9f..d7b669a4fa 100644
--- a/python/tvm/relay/qnn/strategy/__init__.py
+++ b/python/tvm/relay/qnn/strategy/__init__.py
@@ -20,4 +20,5 @@
 from __future__ import absolute_import as _abs
 
 from .generic import *
+from . import arm_cpu
 from . import hexagon
diff --git a/python/tvm/relay/qnn/strategy/arm_cpu.py b/python/tvm/relay/qnn/strategy/arm_cpu.py
new file mode 100644
index 0000000000..f865381783
--- /dev/null
+++ b/python/tvm/relay/qnn/strategy/arm_cpu.py
@@ -0,0 +1,72 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Quantized operator strategy for Arm CPU.
+
+As quantized op schedules, these are only used if the qnn.Legalize pass is disabled. The current
+schedules only work for fused operators with bias, as this is the most common use case. Only
+regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""
+
+from tvm import topi, TVMError
+from .generic import qnn_conv2d_strategy
+from ... import op as _op
+from ...op.strategy.generic import is_depthwise_conv2d
+
+
+@qnn_conv2d_strategy.register("arm_cpu")
+def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
+    """qnn.conv2d strategy for Arm Cortex-M CPUs with DSP.
+
+    When computing convolutions, we want data that will be used to compute the same output values to
+    be adjacent in memory, as this lets us reuse memory loads and use more SIMD instructions.
+
+    For depthwise convolutions, channels do not interact with each other, so the NCHW and IOHW
+    layouts do the best job of keeping "related" data close. In contrast, computing one output of a
+    regular convolution requires reading all input channels, so NHWC and OHWI are best. Hence, these
+    are the layouts we support.
+    """
+
+    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
+        raise TVMError(
+            "Quantized Arm schedules only exist for Cortex-M with DSP! "
+            "The qnn.Legalize pass should be run for other Arm processors."
+        )
+
+    data = inputs[0]
+    kernel = inputs[1]
+    data_layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    groups = attrs.groups
+    strategy = _op.OpStrategy()
+
+    if groups == 1:
+        if data_layout == "NHWC" and kernel_layout == "OHWI":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_conv2d,
+                topi.arm_cpu.schedule_qnn_conv2d,
+                name="qnn_conv2d.arm_cpu",
+            )
+    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
+        if data_layout == "NCHW" and kernel_layout == "IOHW":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_depthwise_conv2d,
+                topi.arm_cpu.schedule_qnn_depthwise_conv2d,
+                name="qnn_depthwise_conv2d.arm_cpu",
+            )
+    else:
+        raise TVMError("No Arm Cortex-M DSP strategy exists for generic group 
qnn.conv2d")
+
+    return strategy
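(Illustration, not part of the patch: the guard in the strategy above can be checked directly from Python. The target string below is an assumption; any Cortex-M part with the DSP extension, such as cortex-m4/m7/m55, should satisfy both conditions.)

    import tvm

    target = tvm.target.Target("c -keys=arm_cpu,cpu -mcpu=cortex-m7")
    # The same two conditions tested by qnn_conv2d_strategy_arm_cpu:
    print(target.features.has_dsp, "cortex-m" in target.mcpu)  # expected: True True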
diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py
index 20f92a8895..eba102662b 100644
--- a/python/tvm/topi/arm_cpu/__init__.py
+++ b/python/tvm/topi/arm_cpu/__init__.py
@@ -23,9 +23,11 @@ from .depthwise_conv2d import *
 from .conv2d_transpose import *
 from .conv2d_int8 import *
 from . import conv2d_alter_op
+from . import qnn_alter_op
 from .bitserial_conv2d import *
 from .bitserial_dense import *
 from .injective import *
 from .group_conv2d import *
 from .pooling import *
 from .dense import *
+from .qnn import *
diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py
index fc46f4b34f..ab489161a8 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -37,10 +37,6 @@ from .mprofile.dsp.conv2d import (
     conv2d_nhwc_dsp_compute,
     conv2d_nhwc_dsp_schedule,
 )
-from .mprofile.dsp.tensordot_conv2ds import (
-    conv2d_nhwc_ohwi_dsp_compute,
-    tensordot_conv2ds_schedule,
-)
 
 
 @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -522,17 +518,3 @@ def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
 def schedule_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return conv2d_nhwc_dsp_schedule(cfg, outs)
-
-
[email protected]_topi_compute("conv2d_nhwc_ohwi_dsp.arm_cpu")
-def conv2d_nhwc_ohwi_dsp(cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype):
-    """Compute conv2d_nhwc_ohwi with v7e-m DSP instructions and the tensordot 
kernel."""
-    return conv2d_nhwc_ohwi_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-    )
-
-
[email protected]_topi_schedule("conv2d_nhwc_ohwi_dsp.arm_cpu")
-def schedule_conv2d_nhwc_ohwi_dsp(cfg, outs):
-    """Create schedule for conv2d_nhwc_ohwi."""
-    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 9284b94745..b6c15a30c0 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -31,10 +31,6 @@ from .mprofile.dsp.depthwise_conv2d import (
     depthwise_conv2d_nhwc_dsp_compute,
     depthwise_conv2d_nhwc_dsp_schedule,
 )
-from .mprofile.dsp.tensordot_conv2ds import (
-    depthwise_conv2d_nchw_oihw_dsp_compute,
-    tensordot_conv2ds_schedule,
-)
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -722,19 +718,3 @@ def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out
 def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs):
     """Create schedule for conv2d_nhwc_dsp"""
     return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs)
-
-
[email protected]_topi_compute("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-def depthwise_conv2d_nchw_oihw_dsp(
-    cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Compute depthwise_conv2d_nchw_oihw with v7e-m DSP instructions and the 
tensordot kernel."""
-    return depthwise_conv2d_nchw_oihw_dsp_compute(
-        cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-    )
-
-
[email protected]_topi_schedule("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-def schedule_depthwise_conv2d_nchw_oihw_dsp(cfg, outs):
-    """Create schedule for depthwise_conv2d_nchw_oihw."""
-    return tensordot_conv2ds_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
index 0fdffc06cf..1d36e1dd1e 100644
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py
@@ -14,142 +14,391 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+"""Generates optimized code to compute a tensor dot product on ARMv7E-M.
 
+This function can be used to tensorize many common operators including regular conv2d, depthwise
+conv2d, and grouped conv2d for some data and kernel layouts. For regular convolution, use data
+layout NHWC and kernel layout OHWI. For depthwise convolution, use data layout NCHW
+and kernel layout OIHW.
+
+The generated code will also work on v8-M chips that have the DSP instructions (unlike v7E-M, they
+are optional in v8-M). Note that the generated code does not use the (potentially very useful) MVE
+instructions present on some v8-M chips.
+"""
+
+from dataclasses import dataclass
+from itertools import chain
 import textwrap
+from typing import Iterator, Optional, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+@dataclass
+class SMLAInstruction:
+    """Class for keeping track of an item in inventory."""
 
+    instruction: str
+    tensor_var: str
+    kernel_var: str
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+    def call_with_acle(self, accumulator_var: str) -> str:
+        return (
+            f"{accumulator_var} = __{self.instruction}"
+            f"({self.tensor_var}, {self.kernel_var}, {accumulator_var});"
+        )
 
+    def has_same_operands(self, other: "SMLAInstruction") -> bool:
+        return self.tensor_var == other.tensor_var and self.kernel_var == other.kernel_var
 
-def make_intrin_tensordot(slices, strides, tensordot_params):
-    """Helper function for constructing tensordot intrinsic. We can't 
construct the whole thing here
-    (as multiple schedules use tensordot and each must build the intrinstic 
differently) but we can
-    build part here to simplify the code."""
 
-    # in_dtype, tensor_h, jump, tensor_w, suffix = tensordot_params
-    data, kernel, output = slices
-    data_strides, kernel_strides = strides
+def _get_c_function_name(num_outputs, dimensions, offsets, x_strides):
+    """Generates a C function name for tensordot.
 
-    data_buf = tir.decl_buffer(
-        data.shape, data.dtype, name="data", offset_factor=1, strides=data_strides
-    )
-    kernel_buf = tir.decl_buffer(
-        kernel.shape,
-        kernel.dtype,
-        name="kernel",
-        offset_factor=1,
-        strides=kernel_strides,
-    )
-    output_buf = tir.decl_buffer(
-        output.shape, output.dtype, name="output", offset_factor=1, strides=[1]
+    We do not need a suffix, as the generated function will have an #include guard. Unlike other
+    microTVM operators, _get_c_function_name is never called externally.
+    """
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{num_outputs}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if num_outputs > 1 else "")
     )
 
-    def intrin_func(ins, outs):
-        builder = tir.ir_builder.create()
-        builder.emit(
-            tir.call_extern(
-                "int32",
-                _get_func_name(*tensordot_params),
-                outs[0].access_ptr("w"),
-                ins[0].access_ptr("r"),
-                ins[1].access_ptr("r"),
-            )
-        )
-        return builder.get()
 
-    return te.decl_tensor_intrin(
-        output.op,
-        intrin_func,
-        binds={data: data_buf, kernel: kernel_buf, output: output_buf},
-    )
+def _init_biased_accumulators(num_outputs):
+    """Generates code to load the bias into the accumulators.
+
+    Addition is commutative, so we could add the bias before, during, or after performing our
+    multiply-accumulate operations. Where we add the bias does not change the overflow behavior.
+
+    Doing the bias add takes one cycle either way (if done at the beginning we can't use a SMULXY
+    trick to set sum_i to zero for "free"). However, doing it at the beginning frees up a register,
+    so we'll do it first.
+    """
+    assignments = [f"sum_{x:x} = *bias" for x in range(num_outputs)]
+    joined_assignments = ", ".join(assignments)
+    return f"int32_t {joined_assignments};"
+
+
+def _get_tensor_halfwords(dimensions, offset, num_outputs, in_stride) -> Iterator[Optional[Tuple]]:
+    """Gets the logical indices of the data that will be stored in memory at 
the tensor pointer.
+
+    Returns an Iterator of Optional[Tuple], while skipping over word-aligned 
pairs of unrelated
+    halfwords. The returned iterator is as short as possible while having even 
length and containing
+    all relevant tensor data. Tuples in the returned Iterator represent an (y, 
x) offset from the
+    top-left tensor position being used in this convolution. We need to be 
aware of the None values
+    so our code is correctly word-aligned.
+
+    One consequence of these requirements - each row in the tensor is broken 
into word-aligned pairs
+    of halfwords (which are later combined into full words). See the test 
cases (located in
+    tests/python/topi/python/test_topi_conv2d_tensordot_opts.py) for usage 
examples.
+    """
+
+    tensor_w, kernel_h, kernel_w = dimensions
+    max_x_val = (num_outputs - 1) * in_stride + kernel_w
+    halfwords = []
+
+    for y in range(kernel_h):
+        # If needed, pad so the beginning of the row is word-aligned
+        if (y * tensor_w + offset) % 2 == 1:
+            halfwords.append(None)
+
+        for x in range(max_x_val):
+            halfwords.append((y, x))
+
+        # If needed, pad so the row length is word aligned
+        if (y * tensor_w + offset + max_x_val) % 2 == 1:
+            halfwords.append(None)
+    return halfwords
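(Illustration, not part of the patch: a small worked example of the helper above, with sizes chosen for brevity. The None entries are the word-alignment padding described in the docstring.)

    from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import _get_tensor_halfwords

    # dimensions = (tensor_w=4, kernel_h=1, kernel_w=3), one output, input stride 1
    print(_get_tensor_halfwords((4, 1, 3), 0, 1, 1))  # [(0, 0), (0, 1), (0, 2), None]
    print(_get_tensor_halfwords((4, 1, 3), 1, 1, 1))  # [None, (0, 0), (0, 1), (0, 2)]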
+
+
+def _get_kernel_halfwords(dimensions, offset) -> Iterator[Optional[Tuple]]:
+    """Gets the logical indices of the data that will be stored in memory at 
the kernel pointer.
 
+    Returns an Iterator of Optional[Tuple]. The returned iterator is as short as possible while
+    having even length and containing all kernel data. Tuples in the returned Iterator represent
+    a (y, x) position in the kernel, while None values represent other, irrelevant data. We need
+    to be aware of the None values so our code is correctly word-aligned.
 
-def tensordot_impl(in_dtype: str, tensor_h: int, jump: int, tensor_w: int, suffix: str) -> str:
-    """Generates C code for taking the dot products of two `tensor_h` * `tensor_w` tensors. Also has
-    a `jump` argument that advances the pointer of one tensor by that many words after each row. The
-    `jump` and `tensor_w` values must be word-aligned for the input data type, as non-word-aligned
-    memory access is slow on the Cortex-M series. Depending on the input datatype, the code may
-    contain DSP instructions for Arm v7e-m. C code contains DSP instructions for Arm v7e-m. See
-    the below pseudocode for reference:
-
-    tensordot(out_ptr, dat_ptr, ker_ptr) {
-        sum = 0;
-        for (i = 0; i < tensor_h; i++) {
-            for (j = 0; j < tensor_w; j++) {
-                sum += (*dat_ptr++) * (*ker_ptr++);
-            }
-            dat_ptr += jump;
-        }
-        *out_ptr = sum;
-    }
+    See test cases in tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for examples.
     """
+    _, kernel_h, kernel_w = dimensions
+    halfwords = []
 
-    simd_lanes = num_simd_lanes_per_word(in_dtype)
-    assert tensor_w % simd_lanes == 0
-    assert jump % simd_lanes == 0
+    # Kernel data starts `offset` places after the pointer value
+    if offset == 1:
+        halfwords.append(None)
 
-    if in_dtype == "int8":
-        inner_loop = """
-              uint32_t tensor_c20 = __SXTB16(tensor_batch);
-              uint32_t kernel_c20 = __SXTB16(kernel_batch);
-              sum = __SMLAD(tensor_c20, kernel_c20, sum);
+    for y in range(kernel_h):
+        for x in range(kernel_w):
+            halfwords.append((y, x))
 
-              uint32_t tensor_c31 = __SXTB16(__ROR(tensor_batch, 8));
-              uint32_t kernel_c31 = __SXTB16(__ROR(kernel_batch, 8));
-              sum = __SMLAD(tensor_c31, kernel_c31, sum);"""
+    # Make sure the returned iterator has even length by padding with an "unknown" value. We want
+    # even length as this corresponds to an integer number of int32 words.
+    if (kernel_h * kernel_w + offset) % 2 == 1:
+        halfwords.append(None)
+    return halfwords
 
-    elif in_dtype == "int16":
-        inner_loop = """
-              sum = __SMLAD(tensor_batch, kernel_batch, sum);"""
 
-    elif in_dtype == "int32":
-        inner_loop = """
-              // Compiles to a single MAC instruction
-              sum += tensor_batch * kernel_batch;"""
+def _get_int16_alias(position) -> str:
+    if position is None:
+        return "unknown"
+    y, x = position
+    return f"y{y:0>2x}_x{x:0>2x}"
+
+
+def _load_tensor_vars(halfwords, tensor_w) -> Iterator[str]:
+    assert len(halfwords) % 2 == 0
+    offset = int(not bool(halfwords[0]))
+
+    for i in range(0, len(halfwords), 2):
+        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
+        y, x = halfwords[i + 1] or halfwords[i]
+        tensor_index = (y * tensor_w + x + offset) // 2
+        yield f"int32_t tensor__{var_name} = tensor[{tensor_index}];"
+
+
+def _load_kernel_vars(halfwords) -> Iterator[str]:
+    assert len(halfwords) % 2 == 0
+    for i in range(0, len(halfwords), 2):
+        var_name = f"{_get_int16_alias(halfwords[i])}__{_get_int16_alias(halfwords[i+1])}"
+        yield f"int32_t kernel__{var_name} = kernel[{i // 2}];"
+
+
+def _get_draft_macs(
+    kernel_dims, tensor_halfwords, kernel_halfwords, offset
+) -> Iterator[SMLAInstruction]:
+    """Generates unrolled MAC instructions to compute one tensordot sum.
+
+    Unrolling these loops increases code size a tiny bit (< 0.02 KB), but makes the generated code
+    much faster. The generated code does not use SIMD instructions - they are added later by
+    _apply_simd_optimizations.
+
+    We return an iterator of SMLAInstruction objects. Returning an iterator lets us do
+    optimizations by iterator chaining.
+    """
+
+    def get_var(y, x, halfwords) -> Tuple[str, str]:
+        i = halfwords.index((y, x))
+        if i % 2 == 0:
+            return f"{_get_int16_alias((y, x))}__{_get_int16_alias(halfwords[i 
+ 1])}", "b"
+        return f"{_get_int16_alias(halfwords[i - 1])}__{_get_int16_alias((y, 
x))}", "t"
+
+    kernel_h, kernel_w = kernel_dims
+    for y in range(kernel_h):
+        for x in range(kernel_w):
+            tensor_var, tensor_half = get_var(y, x + offset, tensor_halfwords)
+            kernel_var, kernel_half = get_var(y, x, kernel_halfwords)
+            instruction = f"smla{tensor_half}{kernel_half}"
+            yield SMLAInstruction(instruction, f"tensor__{tensor_var}", f"kernel__{kernel_var}")
+
+
+def _apply_simd_optimizations(instruction_tuples) -> Iterator[SMLAInstruction]:
+    """When possible, fuses single MACs into SIMD MAC instructions.
+
+    The compiler cannot do this automatically, as calling __smlaxy forces the SMLAxy instruction to
+    be used. This function takes as input an iterator of SMLAInstructions and returns an iterator of
+    SMLAInstructions (possibly of different length).
+    """
+    curr_tuple = next(instruction_tuples, None)
+    while curr_tuple:
+        next_tuple = next(instruction_tuples, None)
+        if next_tuple is None:
+            yield curr_tuple
+            break
+
+        if curr_tuple.has_same_operands(next_tuple):
+            instructions = sorted([curr_tuple.instruction, next_tuple.instruction])
+            if instructions == ["smlabb", "smlatt"]:
+                yield SMLAInstruction("smlad", curr_tuple.tensor_var, 
curr_tuple.kernel_var)
+                next_tuple = next(instruction_tuples, None)
+            elif instructions == ["smlabt", "smlatb"]:
+                yield SMLAInstruction("smladx", curr_tuple.tensor_var, 
curr_tuple.kernel_var)
+                next_tuple = next(instruction_tuples, None)
+            else:
+                yield curr_tuple
+
+        else:
+            yield curr_tuple
+        curr_tuple = next_tuple
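(Illustration, not part of the patch: two scalar MACs on the same operand words collapse into one SMLAD. The operand names below are arbitrary strings used only for the example.)

    from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
        SMLAInstruction,
        _apply_simd_optimizations,
    )

    pair = iter([
        SMLAInstruction("smlabb", "tensor__a", "kernel__a"),
        SMLAInstruction("smlatt", "tensor__a", "kernel__a"),
    ])
    fused = list(_apply_simd_optimizations(pair))
    print(fused[0].call_with_acle("sum_0"))  # sum_0 = __smlad(tensor__a, kernel__a, sum_0);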
+
+
+def _expand_instruction_tuples(instruction_tuples, index) -> Iterator[str]:
+    """Converts an iterator of SMLAInstructions into lines of C code.
+
+    We want the compiler to re-order these with the memory loads, so we generate them as a series of
+    calls to instruction aliases instead of as a single `asm` block.
+    """
+
+    for smla_instruction in instruction_tuples:
+        assert "smla" in smla_instruction.instruction
+
+        # We call the instruction using the Arm C Language Extensions. Using ACLE gives better
+        # cross-compiler compatibility than using __builtin functions.
+        yield smla_instruction.call_with_acle(f"sum_{index}")
+
+
+def _requantize_sums(num_outputs, requantize_shift, output_zero_point) -> Iterator[str]:
+    """Generates code to requantize the accumulator values.
+
+    The generated code does not use floating point instructions, as it simulates floating point
+    multiplication with an int64 multiply + shift. The bias is added at the beginning, so we can
+    skip doing it now. The shift is hard-coded, as this saves a few cycles without hurting accuracy
+    in "most" cases.
+
+    It's *possible* we could save one more cycle here by pre-multiplying the bias with the
+    requantize multiplier, and then doing the bias addition and shift in the same cycle (via <op2>).
+    However, it's complicated and only saves one cycle.
+
+    It's also worth noting the SSAT16 operation doesn't help us here. The data isn't stored as two
+    halfwords in a word, and rearranging it would take at least one cycle. Two SSAT operations are
+    just as good.
+
+    Calling __ssat directly is a little bit gross, but GCC and Clang are unreliable about compiling
+    other ways of writing this. Both the multiply + shift and shift + saturation combine to one
+    instruction each.
+    """
+
+    yield "int32_t scale_val = *scale;"
+    for i in range(num_outputs):
+        yield f"int32_t requant_{i} = (sum_{i} * (int64_t) scale_val) >> 
{requantize_shift - 1};"
+        yield f"requant_{i} = (requant_{i} + 1) >> 1;"
+        yield f"requant_{i} = __ssat(requant_{i} + {output_zero_point}, 8);"
+
+
+def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
+    """Generates code to write the requantized sums to memory.
+
+    Note - halfword packing here *does* help. It seems
+    like it wouldn't, as doing two pipelined int16 stores takes two cycles - the same as halfword
+    packing plus a pipelined int32 store. We still do the int16 stores when there is an output
+    stride, though.
+
+    However, this lets the compiler re-order instructions to better preserve memory, as it doesn't
+    like breaking apart the store instructions (as this messes up pipelining).
+    """
+
+    if stride > 1:
+        for i in range(num_outputs):
+            yield f"((int16_t*) output)[{i * stride + offset}] = (int16_t) 
requant_{i};"
 
     else:
-        raise ValueError(f"No tensordot implementation exists for dtype 
'{in_dtype}'!")
+        num_packed = (num_outputs - offset) // 2
+        for i in range(num_packed):
+            index = 2 * i + offset
+            yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index 
+ 1} << 16);"
 
-    function_name = _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix)
-    return textwrap.dedent(
-        (
-            f"""
-        #include <stdint.h>
-        #include <arm_nnsupportfunctions.h>
+        if offset == 1:
+            yield "((int16_t*) output)[1] = (int16_t) requant_0;"
 
-        #ifdef __cplusplus
-        extern "C"
-        #endif
-        __STATIC_FORCEINLINE int32_t {function_name}(
-            uint32_t *out,
-            uint32_t *tensor,
-            uint32_t *kernel) {{
-
-          uint32_t sum = 0;
-
-          #pragma GCC unroll {tensor_h}
-          for (int i = 0; i < {tensor_h}; i++) {{
-            #pragma GCC unroll {tensor_w // simd_lanes}
-            for (int j = 0; j < {tensor_w // simd_lanes}; j++) {{
-              uint32_t tensor_batch = *tensor++;
-              uint32_t kernel_batch = *kernel++;
-              {inner_loop.strip()}
-            }}
-            tensor += {jump // simd_lanes};
-          }}
-          out[0] = sum;
+        for i in range(num_packed):
+            yield f"output[{offset + i}] = packed_res_{i};"
+
+        if (offset + num_outputs) % 2 == 1:
+            yield f"((int16_t*) output)[{num_packed * 2}] = (int16_t) 
requant_{num_packed * 2};"
+
+
+def tensordot_int16_impl(
+    num_outputs: int,
+    dimensions: Tuple[int, int, int],
+    offsets: Tuple[int, int, int],
+    x_strides: Tuple[int, int],
+    requantize_shift: int = 33,
+    output_zero_point: int = -128,
+) -> Tuple[str, str]:
+    """Generates code to compute a tensor dot product with requantization.
+
+    The generated function takes pointers to the output, tensor, and kernel as input. All pointers
+    must be word aligned. Only works with `int16` data type. The generated code is optimized for the
+    ARMv7E-M architecture.
+
+    Parameters
+    ----------
+    num_outputs: int
+        The number of tensordot outputs to compute per function call. Computing more than one at
+        once makes us much faster by reducing how often overlapping data is loaded. However, setting
+        this too high causes us to run out of registers and need to store data on the stack. We
+        should autotune this, but num_outputs=2 is usually OK.
+
+    dimensions: Tuple[int, int, int]
+        The dimensions of each tensordot operation. dimensions[1] and dimensions[2] are the height
+        and width of the kernel, respectively. dimensions[0] is the width of the data tensor, which
+        is usually larger than the kernel.
+
+    offsets: Tuple[int, int, int]
+        Each value is 0 or 1, and represents how far after the given data, kernel, and output
+        pointers (respectively) we should start reading/writing. This prevents us from having to
+        check if each pointer is aligned or unaligned at runtime, making us faster.
+
+    x_strides: Tuple[int, int]
+        The distance (in halfwords) between the start of each input tensor, and where to write each
+        output result respectively. Only used when num_outputs > 1.
+
+    requantize_shift: int
+        The distance to right shift after multiplying by the requantization scale. Defaults to 33,
+        as this lets us skip a shift operation.
+
+    output_zero_point: int
+        The output zero point, which will be added after scale multiplication but before
+        clipping. Defaults to -128, as most models always use this.
+
+    Returns
+    -------
+    func_name, func_code: Tuple[str, str]
+        The name and source code of the generated function.
+    """
+    function_name = _get_c_function_name(num_outputs, dimensions, offsets, x_strides)
+    tensor_w, kernel_h, kernel_w = dimensions
+    tensor_offset, kernel_offset, output_offset = offsets
+    assert tensor_offset < 2 and kernel_offset < 2 and output_offset < 2
+    in_stride, out_stride = x_strides
+
+    tensor_halfwords = _get_tensor_halfwords(dimensions, tensor_offset, num_outputs, in_stride)
+    kernel_halfwords = _get_kernel_halfwords(dimensions, kernel_offset)
+    load_tensor_lines = _load_tensor_vars(tensor_halfwords, tensor_w)
+    load_kernel_lines = _load_kernel_vars(kernel_halfwords)
+
+    def gen_single_loop_macs(index):
+        draft_macs_iter = _get_draft_macs(
+            (kernel_h, kernel_w), tensor_halfwords, kernel_halfwords, index * in_stride
+        )
+        draft_macs_iter = _apply_simd_optimizations(draft_macs_iter)
+        return _expand_instruction_tuples(draft_macs_iter, index)
+
+    multiply_acc_lines = chain.from_iterable(gen_single_loop_macs(i) for i in range(num_outputs))
+    requantize_lines = _requantize_sums(
+        num_outputs, requantize_shift=requantize_shift, output_zero_point=output_zero_point
+    )
+    write_out_lines = _write_sums_to_memory(num_outputs, output_offset, out_stride)
+
+    def insert_lines(lines):
+        return ("\n" + " " * 10).join(lines)
+
+    # It's very common for one model to have different layers that use identical tensordot
+    # functions. To prevent function re-definition errors, we need an #include guard. This is better
+    # than adding a random suffix, as it saves flash memory.
+    code = textwrap.dedent(
+        f"""
+        #ifndef {function_name.upper()}_EXISTS
+        #define {function_name.upper()}_EXISTS
+        #include <arm_acle.h>
+        __attribute__((always_inline)) static inline int32_t {function_name}(
+            int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, int32_t *scale
+        ) {{
+          {_init_biased_accumulators(num_outputs)}
+
+          {insert_lines(load_tensor_lines)}
+
+          {insert_lines(load_kernel_lines)}
+
+          {insert_lines(multiply_acc_lines)}
+
+          {insert_lines(requantize_lines)}
+
+          {insert_lines(write_out_lines)}
           return 0;
         }}
+        #endif
         """
-        )
     )
+    return (function_name, code)
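(Illustration, not part of the patch: a minimal invocation of the generator defined above. The sizes are made up; see tests/python/topi/python/test_topi_conv2d_tensordot_opts.py for the cases actually exercised.)

    from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import tensordot_int16_impl

    func_name, func_code = tensordot_int16_impl(
        num_outputs=2,          # two output values per call
        dimensions=(48, 3, 3),  # (tensor_w, kernel_h, kernel_w)
        offsets=(0, 0, 0),      # data, kernel, and output pointers all word-aligned
        x_strides=(1, 48),      # (input stride, output stride) in halfwords
    )
    print(func_name)  # tensordot_opt_x2_int16_w48_3x3_000_1_48
    # func_code is the C source, wrapped in an #include guard and later attached
    # to the schedule through the pragma_import_c block attribute.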
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py b/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
deleted file mode 100644
index 79564f98ed..0000000000
--- a/python/tvm/topi/arm_cpu/mprofile/dsp/tensordot_conv2ds.py
+++ /dev/null
@@ -1,296 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Implementations of several conv2d variations, all tensorized using 
tensordot and optimized for
-Cortex-M DSP. Currently contains a standard conv2d and depthwise conv2d 
implementation, but could be
-extended to add a grouped conv2d operator. Due to the way we tensorize, this 
schedule ONLY works
-when the data and kernel layouts are NCHWxc and OIHWxi respectively, where x 
is the number of
-input channels divided by the number of groups."""
-
-import random
-import string
-from typing import Callable, Tuple, Union
-
-import tvm
-from tvm import te
-from tvm.tir import indexdiv, indexmod
-from tvm.topi.utils import traverse_inline
-from tvm.topi.nn.pad import pad
-
-from .micro_kernel.tensordot import (
-    make_intrin_tensordot,
-    tensordot_impl,
-)
-
-
-def _unpack_2d_argument(argument: Union[int, Tuple]) -> Tuple:
-    if isinstance(argument, int):
-        return (argument, argument)
-    assert len(argument) == 2
-    return argument
-
-
-def _check_no_dilation(dilation: Union[int, Tuple]) -> None:
-    """Takes a dilation argument as an integer or tuple, and makes sure both 
dimensions are 1.
-    Dilation prevents us from using DSP instructions, so this schedule can't 
work (aside from the
-    niche case where dilation_h == stride_h and dilation_w == stride_w, which 
is rare enough we
-    probably don't need to support it)."""
-
-    dilation_h, dilation_w = _unpack_2d_argument(dilation)
-    assert dilation_h == dilation_w == 1
-
-
-def _unpack_padding(padding: Tuple) -> Tuple:
-    assert isinstance(padding, tuple)
-    if len(padding) == 2:
-        (pad_up, pad_down), (pad_left, pad_right) = padding
-    else:
-        pad_up, pad_left, pad_down, pad_right = padding
-    return pad_up, pad_left, pad_down, pad_right
-
-
-def _pad_if_needed(data: te.tensor.Tensor, layout: str, padding: Tuple) -> te.tensor.Tensor:
-    """Performs padding on a te.tensor.Tensor object if necessary. If padding = (0, 0, 0, 0), the
-    input tensor is returned unmodified. We only care about tuples here - "VALID" and "SAME" padding
-    will be converted by the importer TFLite importer if present."""
-
-    pad_up, pad_left, pad_down, pad_right = padding
-    if not any(padding):
-        return data
-
-    # We want to pad the "H" and "W" columns, and their position depends on 
the layout
-    pad_before, pad_after = [0, 0, 0, 0], [0, 0, 0, 0]
-    pad_before[layout.index("H")] = pad_up
-    pad_before[layout.index("W")] = pad_left
-    pad_after[layout.index("H")] = pad_down
-    pad_after[layout.index("W")] = pad_right
-    return pad(data, pad_before, pad_after, name="padded_data")
-
-
-def _compute_output_dim(
-    data_dim: int, kernel_dim: int, pad_before: int, pad_after: int, stride: int
-) -> int:
-    """Computes an output dimension of a convolution, given the data 
dimension, kernel dimension,
-    padding, and stride along that axis. Note that when stride > 1, this 
division will often not
-    be perfectly even."""
-    return (data_dim + pad_before + pad_after - kernel_dim) // stride + 1
-
-
-def _wrap_te_compute(
-    shape: Tuple,
-    fcompute: Callable[[int, int, int, int], tvm.ir.PrimExpr],
-    desired_out_layout: str,
-    current_out_layout: str = "NHWC",
-    **kwargs,
-) -> te.tensor.Tensor:
-    """Wrapper over te.compute that allows the output layout to be easily 
changed."""
-    assert current_out_layout.isalpha() and desired_out_layout.isalpha()
-    assert sorted(current_out_layout) == sorted(desired_out_layout)
-    forward_order = (current_out_layout.index(c) for c in desired_out_layout)
-    reverse_order = (desired_out_layout.index(c) for c in current_out_layout)
-
-    return te.compute(
-        tuple(shape[i] for i in forward_order),
-        lambda *args: fcompute(*(args[i] for i in reverse_order)),
-        **kwargs,
-    )
-
-
-def _get_suffix() -> str:
-    """Returns a random eight-character string to append to C function names. 
Prevents accidental
-    re-definition of functions if the same operator appears twice in a Relay 
graph."""
-    return "".join(random.choices(string.ascii_uppercase, k=8))
-
-
-def conv2d_nhwc_ohwi_dsp_compute(
-    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Standard conv2d schedule that can be tensorized using tensordot."""
-
-    stride_h, stride_w = _unpack_2d_argument(strides)
-    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
-    _check_no_dilation(dilation)
-
-    batch_size, data_h, data_w, in_channels = data.shape
-    output_channels, kernel_h, kernel_w, _ = kernel.shape
-    assert kernel.shape[3] == in_channels
-
-    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
-    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-    kc_i = te.reduce_axis((0, in_channels), name="rc")
-
-    padded_data = _pad_if_needed(data, "NHWC", (pad_up, pad_left, pad_down, 
pad_right))
-    return _wrap_te_compute(
-        (batch_size, output_h, output_w, output_channels),
-        lambda n, y, x, c: te.sum(
-            padded_data[n, y * stride_h + kh_i, x * stride_w + kw_i, kc_i].astype(out_dtype)
-            * kernel[c, kh_i, kw_i, kc_i].astype(out_dtype),
-            axis=(kh_i, kw_i, kc_i),
-        ),
-        out_layout,
-        name="conv2d",
-        tag="conv2d_nhwc_ohwi_dsp",
-    )
-
-
-def _make_conv2d_tensorization(padded_data, kernel):
-    _, _, padded_w, in_channels = padded_data.shape
-    _, kernel_h, kernel_w, _ = kernel.shape
-    in_dtype = padded_data.dtype
-    suffix = _get_suffix()
-    assert in_dtype == kernel.dtype
-
-    data_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="a", 
dtype=in_dtype)
-    kernel_slice = te.placeholder((kernel_h, kernel_w, in_channels), name="b", 
dtype=in_dtype)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-    kc_i = te.reduce_axis((0, in_channels), name="kc_i")
-
-    output_slice = te.compute(
-        (1,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i, kc_i].astype("int32")
-            * kernel_slice[kh_i, kw_i, kc_i].astype("int32"),
-            axis=[kh_i, kw_i, kc_i],
-        ),
-        name="c",
-    )
-
-    # TVM has a really strange bug where the outer reduction axis (kh_i) having length 1 causes the
-    # decl_buffer strides check to fail. height_stride is a dark magic workaround for this.
-    height_stride = in_channels * padded_w if kernel_h > 1 else in_channels
-    jump = (padded_w - kernel_w) * in_channels
-    tensordot_params = (in_dtype, kernel_h, jump, kernel_w * in_channels, suffix)
-    intrin_tensordot = make_intrin_tensordot(
-        (data_slice, kernel_slice, output_slice),
-        ([height_stride, in_channels, 1], [kernel_w * in_channels, in_channels, 1]),
-        tensordot_params,
-    )
-
-    tensordot_code = tensordot_impl(*tensordot_params)
-    return (intrin_tensordot, tensordot_code)
-
-
-def depthwise_conv2d_nchw_oihw_dsp_compute(
-    _cfg, data, kernel, strides, padding, dilation, out_layout, out_dtype
-):
-    """Depthwise conv2d schedule that can be tensorized using tensordot."""
-
-    stride_h, stride_w = _unpack_2d_argument(strides)
-    pad_up, pad_left, pad_down, pad_right = _unpack_padding(padding)
-    _check_no_dilation(dilation)
-
-    batch_size, in_channels, data_h, data_w = data.shape
-    _, c_mul, kernel_h, kernel_w = kernel.shape
-    output_channels = in_channels * c_mul
-    assert kernel.shape[0] == in_channels
-
-    output_h = _compute_output_dim(data_h, kernel_h, pad_up, pad_down, stride_h)
-    output_w = _compute_output_dim(data_w, kernel_w, pad_left, pad_right, stride_w)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    padded_data = _pad_if_needed(data, "NCHW", (pad_up, pad_left, pad_down, 
pad_right))
-    return _wrap_te_compute(
-        (batch_size, output_h, output_w, output_channels),
-        lambda n, y, x, c: te.sum(
-            padded_data[
-                n,
-                indexdiv(c, c_mul),
-                y * stride_h + kh_i,
-                x * stride_w + kw_i,
-            ].astype(out_dtype)
-            * kernel[indexdiv(c, c_mul), indexmod(c, c_mul), kh_i, kw_i].astype(out_dtype),
-            axis=(kh_i, kw_i),
-        ),
-        out_layout,
-        name="depthwise_conv2d",
-        tag="depthwise_conv2d_nchw_oihw_dsp",
-    )
-
-
-def _make_depthwise_conv2d_tensorization(padded_data, kernel):
-    _, _, _, padded_w = padded_data.shape
-    _, _, kernel_h, kernel_w = kernel.shape
-
-    in_dtype = padded_data.dtype
-    suffix = _get_suffix()
-    assert in_dtype == kernel.dtype
-
-    data_slice = te.placeholder((kernel_h, kernel_w), name="a", dtype=in_dtype)
-    kernel_slice = te.placeholder((kernel_h, kernel_w), name="b", 
dtype=in_dtype)
-
-    kh_i = te.reduce_axis((0, kernel_h), name="kh_i")
-    kw_i = te.reduce_axis((0, kernel_w), name="kw_i")
-
-    output_slice = te.compute(
-        (1,),
-        lambda k: te.sum(
-            data_slice[kh_i, kw_i].astype("int32") * kernel_slice[kh_i, 
kw_i].astype("int32"),
-            axis=[kh_i, kw_i],
-        ),
-        name="c",
-    )
-
-    jump = padded_w - kernel_w
-    tensordot_params = (in_dtype, kernel_h, jump, kernel_w, suffix)
-    intrin_tensordot = make_intrin_tensordot(
-        (data_slice, kernel_slice, output_slice),
-        ([padded_w, 1], [kernel_w, 1]),
-        tensordot_params,
-    )
-
-    tensordot_code = tensordot_impl(*tensordot_params)
-    return (intrin_tensordot, tensordot_code)
-
-
-def tensordot_conv2ds_schedule(_cfg, outs):
-    """Schedule function using v7e-m DSP instructions for all the conv2d 
operators in this file. We
-    use one schedule function for them all, because they are tensorized with 
the same kernel."""
-
-    schedule = te.create_schedule([x.op for x in outs])
-
-    def _callback(operator):
-        if "conv2d" in operator.tag:
-            output = operator.output(0)
-            padded_data = output.op.input_tensors[0]
-            kernel = output.op.input_tensors[1]
-
-            if operator.tag == "conv2d_nhwc_ohwi_dsp":
-                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
-                kh_ax, kw_ax, ci_ax = schedule[output].op.reduce_axis
-                schedule[output].reorder(b_ax, y_ax, x_ax, co_ax, kh_ax, kw_ax, ci_ax)
-                intrin, code = _make_conv2d_tensorization(padded_data, kernel)
-
-            elif operator.tag == "depthwise_conv2d_nchw_oihw_dsp":
-                b_ax, y_ax, x_ax, co_ax = schedule[output].op.axis
-                kh_ax, kw_ax = schedule[output].op.reduce_axis
-                schedule[output].reorder(b_ax, co_ax, y_ax, x_ax, kh_ax, kw_ax)
-                intrin, code = _make_depthwise_conv2d_tensorization(padded_data, kernel)
-
-            else:
-                raise ValueError(f"Cannot tensorize {operator.tag} with 
tensordot!")
-
-            schedule[output].tensorize(kh_ax, intrin)
-            schedule[output].pragma(b_ax, "import_c", code)
-
-    traverse_inline(schedule, outs[-1].op, _callback)
-    return schedule
diff --git a/python/tvm/topi/arm_cpu/qnn.py b/python/tvm/topi/arm_cpu/qnn.py
new file mode 100644
index 0000000000..fad64cc09b
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/qnn.py
@@ -0,0 +1,370 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Contains TVMScript implementations of some QNN operators for Arm.
+
+Currently, the only ops with compute functions are fused regular and depthwise convolutions for
+Arm Cortex-M with DSP.
+"""
+
+from typing import Tuple
+
+import tvm
+from tvm import te
+from tvm.tir import const
+from tvm.script import tir as T
+from ..utils import get_const_tuple
+from .mprofile.dsp.micro_kernel import tensordot
+
+
+def int_ceil_division(x, y):
+    return -(x // -y)
+
+
+def _compute_output_dim(data_length, kernel_length, stride):
+    return int_ceil_division(data_length + 1 - kernel_length, stride)
+
+
+def _pick_tensordot_impl(attrs, inputs, num_outputs=2, is_depthwise=False):
+    """Helper function that chooses the right implementation of 
micro_kernel.tensordot.
+
+    Takes as input the parameters of the conv2d, and returns a tuple of TWO (function_name,
+    function_code). The first pair (the aligned one) is for even numbered output channels, and the
+    second pair (the offset one) is for odd-numbered output channels. This function is used for
+    regular and depthwise convolutions.
+
+    We need different implementations for even vs odd numbered output channels, because the "start"
+    of an odd output channel in the data tensor or kernel might or might not be on a word boundary,
+    and the tensordot code expects all input pointers to be word-aligned.
+    data, kernel = inputs[0:2]
+    rq_output_zero_point_const = inputs[10]
+    assert len(rq_output_zero_point_const.op.body) == 1
+    output_zero_point = rq_output_zero_point_const.op.body[0]
+
+    _, stride_w = get_const_tuple(attrs.strides)
+
+    if is_depthwise:
+        assert attrs.data_layout == "NCHW"
+        assert attrs.kernel_layout == "IOHW"
+        _, _, height, width = get_const_tuple(data.shape)
+        _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
+
+        dimensions = (width, kernel_h, kernel_w)
+        in_stride = stride_w
+        data_per_oc_size = height * width
+    else:
+        assert attrs.data_layout == "NHWC"
+        assert attrs.kernel_layout == "OHWI"
+        _, height, width, in_channels = get_const_tuple(data.shape)
+        out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+
+        dimensions = (width * in_channels, kernel_h, kernel_w * in_channels)
+        in_stride = in_channels * stride_w
+        data_per_oc_size = 0
+
+    assert attrs.out_layout is not None
+    if attrs.out_layout == "NHWC":
+        out_stride = out_channels
+    elif attrs.out_layout == "NCHW":
+        out_stride = 1
+    else:
+        raise ValueError(f"Unsupported output layout {attrs.out_layout}!")
+
+    x_strides = (in_stride, out_stride)
+    aligned_func = tensordot.tensordot_int16_impl(
+        num_outputs,
+        dimensions,
+        (0, 0, 0),
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    kernel_per_oc_size = dimensions[1] * dimensions[2]
+
+    offsets = (data_per_oc_size % 2, kernel_per_oc_size % 2, 0)
+    offset_func = tensordot.tensordot_int16_impl(
+        num_outputs,
+        dimensions,
+        offsets,
+        x_strides,
+        output_zero_point=output_zero_point,
+    )
+
+    return (aligned_func, offset_func)
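(Illustration, not part of the patch: the even/odd split is plain halfword arithmetic. Mirroring the 8x3x3x3 OHWI example discussed in qnn_conv2d below, each output channel's kernel slice is 27 halfwords, so every other channel starts in the middle of a 32-bit word and needs the offset variant.)

    kernel_h, kernel_w, in_channels = 3, 3, 3
    halfwords_per_oc = kernel_h * kernel_w * in_channels  # 27, an odd number
    print([(c * halfwords_per_oc) % 2 for c in range(4)])  # [0, 1, 0, 1]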
+
+
+def _make_tscript_ptr(buffer, offset, length, dtype="int16"):
+    return T.tvm_access_ptr(
+        T.type_annotation(dtype=dtype),
+        buffer.data,
+        offset,
+        length,
+        1,
+        dtype="handle",
+    )
+
+
+def _make_tscript_call(func_name, *args):
+    return T.evaluate(T.call_extern(func_name, *args, dtype="int32"))
+
+
+def _make_conv2d_primfunc(
+    call_dimensions: Tuple,
+    buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple],
+    aligned_func: Tuple[str, str],
+    offset_func: Tuple[str, str],
+    ptr_gens: Tuple,
+):
+    height, width, out_channels = call_dimensions
+    data_shape, kernel_shape, bias_shape, scale_shape, output_shape = buffer_shapes
+    aligned_func_name, aligned_func_code = aligned_func
+    offset_func_name, offset_func_code = offset_func
+    output_ptr, data_ptr, kernel_ptr = ptr_gens
+
+    # If the functions are identical, we can skip the second loop
+    if aligned_func_name == offset_func_name:
+        aligned_channels = out_channels
+        offset_channels = tvm.tir.const(0)
+        c_step = tvm.tir.const(1)
+    else:
+        aligned_channels = out_channels // 2
+        offset_channels = out_channels // 2
+        c_step = tvm.tir.const(2)
+
+    def bias_ptr(bias, c):
+        return _make_tscript_ptr(bias, c, 1, dtype="int32")
+
+    def scale_ptr(scale, c):
+        return _make_tscript_ptr(scale, c, 1, dtype="int32")
+
+    @T.prim_func
+    def biased_quantized_conv2d(
+        data_handle: T.handle,
+        kernel_handle: T.handle,
+        bias_handle: T.handle,
+        scale_handle: T.handle,
+        output_handle: T.handle,
+    ) -> None:
+
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        data = T.match_buffer(data_handle, data_shape, dtype="int16")
+        kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
+        bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
+
+        # We don't specify a data type for the requantization scale, even though we will read it as
+        # an int32. This is because we must pretend it is a float32, as Relay's requantize op only
+        # allows floating point scales.
+        scale = T.match_buffer(scale_handle, scale_shape)
+        output = T.match_buffer(output_handle, output_shape, dtype="int16")
+
+        # This hack prevents TVM from seeing these variables as "unused". I should be using T.reads
+        # and T.writes, but they don't work. I think it's an issue with BufferTouchedDomain.
+        # pylint: disable=unused-variable
+        output[0, 0, 0, 0] = 0
+        __1 = data[0, 0, 0, 0]
+        __2 = kernel[0, 0, 0, 0]
+        __3 = bias[0, 0, 0, 0]
+        __4 = scale[0]
+        # pylint: enable=unused-variable
+
+        for c_ax, y_ax, x_ax in T.grid(aligned_channels, height, width):
+            with T.block("conv2d_aligned"):
+                T.block_attr({"pragma_import_c": aligned_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    aligned_func_name,
+                    output_ptr(output, y, x, c * c_step),
+                    data_ptr(data, y, x, c * c_step),
+                    kernel_ptr(kernel, c * c_step),
+                    bias_ptr(bias, c * c_step),
+                    scale_ptr(scale, c * c_step),
+                )
+
+        for c_ax, y_ax, x_ax in T.grid(offset_channels, height, width):
+            with T.block("conv2d_offset"):
+                T.block_attr({"pragma_import_c": offset_func_code})
+                y, x, c = T.axis.remap("SSS", [y_ax, x_ax, c_ax])
+                _make_tscript_call(
+                    offset_func_name,
+                    output_ptr(output, y, x, c * c_step + 1),
+                    data_ptr(data, y, x, c * c_step + 1, offset=1),
+                    kernel_ptr(kernel, c * c_step + 1, offset=1),
+                    bias_ptr(bias, c * c_step + 1),
+                    scale_ptr(scale, c * c_step + 1),
+                )
+
+    return biased_quantized_conv2d
+
+
+def qnn_conv2d(attrs, inputs, out_type):
+    """Compute for qnn.conv2d with NHWC layout.
+
+    Note that this is a DIFFERENT layout from the Hexagon variant, because they have special
+    instructions Cortex-M doesn't have. We expect the kernel to have OHWI layout. We also assume
+    that padding is not necessary, as it will have been done by another pass.
+    """
+
+    # Make a few checks to unpack the function arguments and ensure it was called with the right
+    # arguments. Note that unlike most schedules, qnn_conv2d does not use a wrapper.
+    assert len(inputs) == 11
+    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
+    output_layout = attrs.out_layout
+    assert output_layout == "NHWC"
+
+    _, height, width, in_channels = get_const_tuple(data.shape)
+    out_channels, kernel_h, kernel_w, _ = get_const_tuple(kernel.shape)
+    y_stride, x_stride = get_const_tuple(attrs.strides)
+
+    out_height = _compute_output_dim(height, kernel_h, y_stride)
+    out_width = _compute_output_dim(width, kernel_w, x_stride)
+
+    # Decide how many sums our function should have running at the same time. Doing
+    # this lets us do "more work" for each memory load, but doing too many of them causes us to run
+    # out of registers. Currently this is set to either 1 or 2, but autotuning this value would
+    # improve performance a lot. Tracked by https://github.com/apache/tvm/issues/13528.
+
+    num_outputs = 2
+
+    # Next, decide whether we need "parity alternation". For example, if we have an
+    # 8x3x3x3 kernel (8 output channels, height 3, width 3, input channels 3) in the OHWI layout,
+    # then every output channel kernel slice will be 27 halfwords. This means every other output
+    # channel will not be word aligned, which will cause slowness/crashes!
+    # We solve this problem by handling the "aligned" and "offset" output 
channels with different
+    # versions of our tensordot function. The "aligned func" assumes that the 
start positions of the
+    # output, data, and kernel are given exactly by their pointer. The 
"offset" version assumes that
+    # the "true" start of the output is the value in the output pointer, plus 
an offset of 0 or 1.
+    # _pick_tensordot_impl decides whether this is the case. If not, we only 
want to generate one
+    # function (to save flash), so offset_func is a tuple of empty strings.
+
+    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, 
num_outputs, False)
+
+    # Helper functions to make pointers
+    def output_ptr(buffer, y, x, c):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(out_width * out_channels) + x * const(out_channels * 
num_outputs) + c,
+            1,
+        )
+
+    # We need to disable pylint's unused argument checker, as the kwarg offset is unused here but
+    # must be present for compatibility. We cannot prefix it with an underscore as we normally
+    # would, since that would change the keyword name.
+
+    # pylint: disable=unused-argument
+    def data_ptr(buffer, y, x, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(y_stride * width * in_channels)
+            + x * const(x_stride * num_outputs * in_channels),
+            1,
+        )
+
+    # pylint: enable=unused-argument
+
+    def kernel_ptr(buffer, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            c * const(kernel_h * kernel_w * in_channels) - offset,
+            1,
+        )
+
+    prim_func = _make_conv2d_primfunc(
+        (const(out_height), const(out_width // num_outputs), 
const(out_channels)),
+        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
+        aligned_func,
+        offset_func,
+        (output_ptr, data_ptr, kernel_ptr),
+    )
+
+    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, 
name="tir", dtype="int16")
+    return [output]
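
As a rough illustration (not part of the patch) of the "parity alternation" check described in the
comment inside qnn_conv2d above: with int16 elements, a kernel slice that starts on an odd halfword
offset is not 32-bit word aligned. The helper names below are made up for this sketch; the real
decision is made by _pick_tensordot_impl.

# Hypothetical standalone sketch of the alignment problem for an int16 OHWI kernel. Each halfword
# is two bytes, so a slice starting on an odd halfword offset is not word (32-bit) aligned.

def kernel_slice_starts(out_channels, kernel_h, kernel_w, in_channels):
    """Halfword offset at which each output channel's kernel slice begins."""
    slice_len = kernel_h * kernel_w * in_channels  # halfwords per output channel
    return [c * slice_len for c in range(out_channels)]

def needs_parity_alternation(kernel_h, kernel_w, in_channels):
    # An odd per-channel slice length makes every other slice start on an odd halfword.
    return (kernel_h * kernel_w * in_channels) % 2 == 1

# The 8x3x3x3 example from the comment: 27 halfwords per slice, so output channels 1, 3, 5, 7
# start on odd halfword offsets and need the "offset" tensordot variant, while channels
# 0, 2, 4, 6 can use the "aligned" one.
assert needs_parity_alternation(3, 3, 3)
assert [s % 2 for s in kernel_slice_starts(8, 3, 3, 3)] == [0, 1, 0, 1, 0, 1, 0, 1]
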
+
+
+def schedule_qnn_conv2d(_attrs, _outs, _target):
+    """Schedule function for qnn.conv2d."""
+    return None
+
+
+def qnn_depthwise_conv2d(attrs, inputs, out_type):
+    """Compute for qnn.depthwise_conv2d with NCHW layout.
+
+    Works basically the same way as regular conv2d - see above.
+    """
+
+    assert len(inputs) == 11
+    data, kernel, _izp, _kzp, _iscale, _kscale, bias, scale = inputs[0:8]
+    output_layout = attrs.out_layout
+    assert output_layout == "NHWC"
+
+    _, _, height, width = get_const_tuple(data.shape)
+    _, out_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)
+    _, out_height, out_width, _ = get_const_tuple(out_type.shape)
+    y_stride, x_stride = get_const_tuple(attrs.strides)
+
+    out_height = _compute_output_dim(height, kernel_h, y_stride)
+    out_width = _compute_output_dim(width, kernel_w, x_stride)
+
+    num_outputs = 2
+
+    aligned_func, offset_func = _pick_tensordot_impl(attrs, inputs, 
num_outputs, True)
+
+    # Helper functions for making pointers.
+    def output_ptr(buffer, y, x, c):
+        return _make_tscript_ptr(
+            buffer,
+            y * const(out_width * out_channels) + x * const(out_channels * 
num_outputs) + c,
+            1,
+        )
+
+    def data_ptr(buffer, y, x, c, offset=0):
+        if height * width % 2 == 1:
+            x_ptr_offset = tvm.tir.const(-1)
+        else:
+            x_ptr_offset = tvm.tir.const(0)
+
+        return _make_tscript_ptr(
+            buffer,
+            c * const(width * height)
+            + y * const(y_stride * width)
+            + x * const(x_stride * num_outputs)
+            + offset * x_ptr_offset,
+            1,
+        )
+
+    def kernel_ptr(buffer, c, offset=0):
+        return _make_tscript_ptr(
+            buffer,
+            c * tvm.tir.const(kernel_h * kernel_w) - offset,
+            1,
+        )
+
+    prim_func = _make_conv2d_primfunc(
+        (const(out_height), const(out_width // num_outputs), 
const(out_channels)),
+        (data.shape, kernel.shape, bias.shape, scale.shape, out_type.shape),
+        aligned_func,
+        offset_func,
+        (output_ptr, data_ptr, kernel_ptr),
+    )
+
+    output = te.extern_primfunc([data, kernel, bias, scale], prim_func, 
name="tir", dtype="int16")
+    return [output]
+
+
+def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target):
+    """Schedule function for qnn.depthwise_conv2d."""
+    return None
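
As a rough check (not part of the patch) of the flattened-index arithmetic used by the output_ptr
helpers in qnn_conv2d and qnn_depthwise_conv2d above: for an NHWC output, the flat element offset
of [0, y, x, c] is y*W*C + x*C + c, and because each tensordot call writes num_outputs adjacent
columns, the x loop variable counts column groups, turning the middle term into x*(C*num_outputs).
The extents below are made up.

import numpy as np

H, W, C = 4, 6, 8            # made-up NHWC output extents
num_outputs = 2              # each tensordot call writes two adjacent output columns

out = np.arange(H * W * C).reshape(1, H, W, C)
flat = out.reshape(-1)

y, x, c = 2, 1, 5            # x counts groups of num_outputs columns, as in the TIR loop
offset = y * (W * C) + x * (C * num_outputs) + c
assert flat[offset] == out[0, y, x * num_outputs, c]
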
diff --git a/python/tvm/topi/arm_cpu/qnn_alter_op.py 
b/python/tvm/topi/arm_cpu/qnn_alter_op.py
new file mode 100644
index 0000000000..00225493db
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/qnn_alter_op.py
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Arm Cortex-M specific optimizations for quantized operators."""
+
+import numpy as np
+
+from tvm import nd, relay, target
+from ..nn import qnn_requantize_alter_layout, qnn_add_alter_layout
+
+
+@qnn_requantize_alter_layout.register(["arm_cpu"])
+def alter_requantize_layout(attrs, inputs, _tinfos, _out_type):
+    """Changes a floating point requantize op to use int64 multiply + shift 
for microTVM.
+
+    Usually, this is done by QNN legalization. However, microTVM wants to 
manually choose the
+    integer rounding constants in order to:
+        (a) Have int32, not int64 constants
+        (b) Use a constant rounding shift to skip a memory load.
+
+    Ideally, we would pick these constants in the requantize (or fused) 
schedule. Unfortunately that
+    is not currently possible, so we pick them with `alter_layout` as a hack. 
This will only work if
+    the requantize schedule "plays along" with this hack.
+    """
+
+    # Only microTVM Cortex-M boards with DSP use the relevant schedules
+    current_target = target.Target.current(allow_none=False)
+    if not (current_target.features.has_dsp and "cortex-m" in 
current_target.mcpu):
+        return None
+
+    _, in_scale, _, out_scale, _ = inputs
+    in_scale_numpy = in_scale.data.numpy().astype("float64")
+    out_scale_scalar = out_scale.data.numpy().item()
+
+    # Shifting by 33 and rounding means shifting by 32, adding 1, and shifting 
by 1 again. This is
+    # useful, because shifting a multiplication product by 32 can be done for 
"free" with SMMUL
+    scales = ((in_scale_numpy / out_scale_scalar) * 2**33).astype("int32")
+
+    # Requantize ops in Relay do not support int32 scales - if we try to use 
one, requantize.cc will
+    # raise an error. As a hacky work-around, we change the scale dtype to 
float32, without changing
+    # underlying data. This works, as our compute function knows to interpret 
the scale as an int32.
+
+    # This is only a work-around - a better long-term solution would be adding 
a new integer
+    # requantize op, which takes integer scales, shifts, and rounding behavior.
+    fake_float_scales = scales.view("float32")
+
+    scale_constant = relay.Constant(nd.array(fake_float_scales))
+    return relay.qnn.op.requantize(inputs[0], scale_constant, *inputs[2:], 
**attrs)
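
A rough sketch (not part of the patch) of the arithmetic the scale above relies on: multiplying the
accumulator by round((in_scale / out_scale) * 2**33) and shifting right by 33 with round-to-nearest
is equivalent to shifting right by 32 (which SMMUL gives for free) and then applying "+ 1, >> 1",
which is the sequence the generated tensordot kernels emit. The scale and accumulator values below
are made up for illustration.

import numpy as np

in_scale, out_scale = 0.0038, 0.125   # example per-channel input scale and output scale
acc = 4937                            # example int32 convolution accumulator

scale = int(round((in_scale / out_scale) * 2**33))
product = acc * scale                 # a 64-bit intermediate on device

# Shift by 33 with round-to-nearest, done directly...
direct = (product + (1 << 32)) >> 33
# ...and done the way the generated code does it: ">> 32" first, then "(+ 1) >> 1".
smmul_style = ((product >> 32) + 1) >> 1
assert direct == smmul_style

# The int32 scale is smuggled through Relay by reinterpreting its raw bits as float32,
# since qnn.requantize rejects integer scale dtypes.
fake_float = np.array([scale], dtype="int32").view("float32")
assert fake_float.view("int32")[0] == np.int32(scale)
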
+
+
+def _is_qnn_op_depthwise_conv2d(qnn_conv2d_op):
+    return relay.op.strategy.generic.is_depthwise_conv2d(
+        qnn_conv2d_op.args[0].type_annotation.shape,
+        qnn_conv2d_op.attrs.data_layout,
+        qnn_conv2d_op.args[1].data.shape,
+        qnn_conv2d_op.attrs.kernel_layout,
+        qnn_conv2d_op.attrs.groups,
+    )
+
+
+@qnn_add_alter_layout.register(["arm_cpu"])
+def alter_add_layout(_attrs, inputs, _tinfos, _out_type):
+    """Fuses the zero point for a previous quantized operator with this add 
operation.
+
+    Currently only supports qnn.conv2d, but qnn.dense support should be added. 
Note that this
+    optimization means we must pad tensors with the input zero point, and NOT 
with zero.
+    """
+
+    prev_op, biases = inputs
+    if not hasattr(prev_op, "op"):
+        return None
+    if prev_op.op.name != "qnn.conv2d":
+        return None
+
+    # We should not perform this alteration if the target has a uint * int 
SIMD MAC operation (since
+    # these do (x - (-128)) * y efficiently, and conv_input_zp is usually 
-128). For now, we
+    # restrict this optimization to just Cortex-M devices, but it might be 
helpful on others too.
+    current_target = target.Target.current(allow_none=False)
+    if not "cortex-m" in current_target.mcpu:
+        return None
+
+    conv_input_zp = prev_op.args[2].data.numpy().item()
+    kernel = prev_op.args[1].data.numpy()
+
+    if _is_qnn_op_depthwise_conv2d(prev_op):
+        axes_to_sum = "HW"
+    elif prev_op.attrs.groups == 1:
+        axes_to_sum = "HWI"
+    else:
+        # This alteration does not currently support grouped conv2d
+        return None
+    axes_to_sum = tuple(map(prev_op.attrs.kernel_layout.index, axes_to_sum))
+    element_sums = np.sum(kernel, axis=axes_to_sum).flatten()
+
+    # The zero point is subtracted from the input elements, so we need a "-" 
sign here
+    zp_shifted_sums = element_sums * (-conv_input_zp)
+
+    # We want to make sure new_biases is representable as an int32. It's 
tempting to just check
+    # whether arr.dtype == "int32" (since Numpy will automatically increase 
dtype in some cases)
+    # but this leads to weird wrapping behavior and doesn't work. We must do 
it manually.
+    new_biases = biases.data.numpy().astype("int64") + zp_shifted_sums
+    if new_biases.min() < -(2**31) or new_biases.max() > 2**31 - 1:
+        return None
+
+    new_input_zp = relay.Constant(nd.array(np.int32(0)))
+    new_conv_args = (*prev_op.args[:2], new_input_zp, *prev_op.args[3:])
+    new_conv_op = relay.qnn.op.conv2d(*new_conv_args, **prev_op.attrs)
+    bias_constant = relay.Constant(nd.array(new_biases.astype("int32")))
+    return relay.add(new_conv_op, bias_constant)
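
A quick numeric check (not part of the patch) of the identity this rewrite depends on: for any
convolution window, sum((x - zp) * w) == sum(x * w) - zp * sum(w), so subtracting the input zero
point can be traded for adding -zp * sum(w) to the bias and running the convolution with an input
zero point of 0. The shapes and values below are arbitrary.

import numpy as np

rng = np.random.default_rng(0)
zero_point = -128                                    # typical int8 input zero point
window = rng.integers(-128, 128, size=(3, 3, 8))     # one HWI input window
weights = rng.integers(-128, 128, size=(3, 3, 8))    # one output channel's kernel slice
bias = np.int64(1000)

# Convolution output for this window, subtracting the zero point from the input...
with_zp = np.sum((window - zero_point) * weights) + bias
# ...versus the rewritten form: zero point folded into the bias, input used as-is.
folded_bias = bias + np.sum(weights) * (-zero_point)
without_zp = np.sum(window * weights) + folded_bias

assert with_zp == without_zp
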
diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py
index caed285800..222f7a7c22 100644
--- a/python/tvm/topi/nn/qnn.py
+++ b/python/tvm/topi/nn/qnn.py
@@ -188,3 +188,51 @@ def simulated_dequantize(data, in_dtype, input_scale=None, 
input_zero_point=None
         return intn_value
 
     return te.compute(data.shape, lambda *indices: 
_dispatch_sim_dequantize(data)[indices])
+
+
[email protected]_func
+def qnn_requantize_alter_layout(_attrs, _inputs, _tinfos, _out_type):
+    """Change requantize layout.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level 
and operator level.
+    """
+    return None
+
+
[email protected]_func
+def qnn_add_alter_layout(_attrs, _inputs, _tinfos, _out_type):
+    """Change add layout.
+
+    Add is not a QNN-specific function, but this generic exists so that bias 
add operations can be
+    fused with input zero point add optimizations, which only happens if the 
previous operator is
+    quantized.
+
+    Parameters
+    ----------
+    attrs : tvm.ir.Attrs
+        Attributes of current convolution
+    inputs : tvm.relay.Expr
+        Grouped input symbols
+    tinfos : list
+        Input shape and dtype
+    out_type: type
+        The output type
+
+    Note
+    ----
+    Unlike other TOPI functions, this function operates on both graph level 
and operator level.
+    """
+    return None
diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc
index 64a5a02e6e..2170ba76e0 100644
--- a/src/relay/qnn/op/convolution.cc
+++ b/src/relay/qnn/op/convolution.cc
@@ -53,8 +53,9 @@ bool QnnConv2DRel(const Array<Type>& types, int num_inputs, 
const Attrs& attrs,
   ICHECK(data->dtype == DataType::Int(8) || data->dtype == DataType::UInt(8) ||
          data->dtype == DataType::Int(16))
       << "Expected qnn conv2d type(int8, uint8, int16) for input but was " << 
data->dtype;
-  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == 
DataType::UInt(8))
-      << "Expected qnn conv2d type(int8, uint8) for weight but was " << 
weight->dtype;
+  ICHECK(weight->dtype == DataType::Int(8) || weight->dtype == 
DataType::UInt(8) ||
+         weight->dtype == DataType::Int(16))
+      << "Expected qnn conv2d type(int8, uint8, int16) for weight but was " << 
weight->dtype;
   ICHECK(param->out_dtype == DataType::Int(16) || param->out_dtype == 
DataType::Int(32) ||
          param->out_dtype == DataType::Int(64))
       << "Expected qnn conv2d type(int16, int32, int64) for output but was " 
<< param->out_dtype;
diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py 
b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
index a3e48f4424..58173a9ea6 100644
--- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py
+++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py
@@ -120,7 +120,7 @@ def test_multiply_to_depthwise(dtype, shape, channels, 
reverse_inputs):
 @requires_ethosn
 @pytest.mark.parametrize(
     "dtype,shape,constant_shape",
-    [("int8", (1, 4, 4), (4,)), ("int16", (1, 16, 12, 4), (1, 1, 1, 4))],
+    [("int8", (1, 4, 4), (4,)), ("int32", (1, 16, 12, 4), (1, 1, 1, 4))],
 )
 def test_unsupported_multiply_to_depthwise(dtype, shape, constant_shape):
     """Check that unsupported variants of multiply to depthwise are not 
converted."""
@@ -339,7 +339,7 @@ def test_add_to_depthwise(reverse_inputs):
 
 @requires_ethosn
 @pytest.mark.parametrize(
-    "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int16", 
(1, 4, 4, 4), (4,))]
+    "dtype,lhs_shape,rhs_shape", [("uint8", (1, 4, 4), (1, 1, 4)), ("int32", 
(1, 4, 4, 4), (4,))]
 )
 def test_unsupported_add_to_depthwise(dtype, lhs_shape, rhs_shape):
     """Check that unsupported variants of add are not converted."""
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d.py 
b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
index 6cf4bbb8e6..1b9c1a5e2e 100644
--- a/tests/python/relay/strategy/arm_cpu/test_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_conv2d.py
@@ -93,28 +93,6 @@ class TestConv2d_NHWC_Spatial_Pack(Conv2dTests):
     schedule_name = parameter("conv2d_nhwc_spatial_pack.arm_cpu")
 
 
-class TestConv2d_Tensordot(Conv2dTests):
-    """This test is for the regular conv2d schedule tensorized using 
tensordot."""
-
-    data_shape, kernel_size, num_filter, strides, padding = parameters(
-        # Disabled because these kernels are not an integral number of words
-        # ((1, 32, 32, 1), (3, 3), 12, 1, 0),
-        # ((1, 32, 10, 3), (3, 3), 16, 1, 0),
-        # ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1)),
-        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0)),
-        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0),
-        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1)),
-        ((4, 16, 16, 16), (5, 5), 8, 2, 0),
-    )
-    dilation = parameter(1)
-    in_dtype = parameter("int8", "int16", "int32")
-
-    data_layout = parameter("NHWC")
-    kernel_layout = parameter("OHWI")
-    out_layout = parameter("NHWC", "NCHW")
-    schedule_name = parameter("conv2d_nhwc_ohwi_dsp.arm_cpu")
-
-
 class TestConv2d_NCHW_Spatial_Pack(Conv2dTests):
     """This test is for conv2d_nchw_spatial_pack.arm_cpu schedule."""
 
diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py 
b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
index f45d27bdae..95ae105f91 100644
--- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py
@@ -110,36 +110,5 @@ class 
TestDepthwiseConv2d_NHWC_HWOI_DSP(DepthwiseConv2dTests):
     schedule_name = parameter("depthwise_conv2d_nhwc_dsp.arm_cpu")
 
 
-class TestDepthwiseConv2d_Tensordot(DepthwiseConv2dTests):
-    """This test is for the depthwise_conv2d schedule tensorized using 
tensordot."""
-
-    data_shape, kernel_size, num_filter, strides, padding, in_dtype = 
parameters(
-        # Currently, our schedule requires kernel_w be divisible by the number 
of simd lanes given
-        # its dtype. This means 3x3 and 5x5 kernels do not work on int16 or 
int8 for now. If you had
-        # to, you could hack around this by padding the data and kernel.
-        ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, "int32"),
-        ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, "int32"),
-        ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, "int32"),
-        ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), "int32"),
-        ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, "int32"),
-        ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, "int32"),
-        ((1, 24, 24, 8), (5, 5), 8, (1, 1), 1, "int32"),
-        ((1, 24, 24, 8), (3, 5), 8, (1, 1), 1, "int32"),
-        # These "evenly divisible" kernels work on smaller dtypes.
-        ((1, 48, 48, 8), (3, 2), 8, 1, 0, "int16"),
-        ((1, 48, 48, 8), (4, 4), 8, 1, 0, "int8"),
-    )
-    dilation = parameter(1)
-
-    data_layout = parameter("NCHW")
-    kernel_layout = parameter("OIHW")
-    out_layout = parameter("NHWC", "NCHW")
-    schedule_name = parameter("depthwise_conv2d_nchw_oihw_dsp.arm_cpu")
-
-
 if __name__ == "__main__":
     main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py 
b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
index 499d677e8f..d48c7e138f 100644
--- a/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
+++ b/tests/python/relay/strategy/arm_cpu/test_generalized_conv2d.py
@@ -26,7 +26,7 @@ from tvm.testing.aot import AOTTestModel, compile_and_run, 
generate_ref_data
 from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
 
 
-def _change_ndarray_layout(arr, src_layout, dst_layout):
+def change_ndarray_layout(arr, src_layout, dst_layout):
     """Makes a copy of an ndarray, reshaping it to a new data layout.
 
     Parameter
@@ -96,7 +96,7 @@ class GeneralizedConv2dTests:
 
         ref_relay_op = relay.op.nn.conv2d(
             ref_input_var,
-            relay.const(_change_ndarray_layout(ref_kernel_data, "HWIO", 
self.ref_kernel_layout)),
+            relay.const(change_ndarray_layout(ref_kernel_data, "HWIO", 
self.ref_kernel_layout)),
             kernel_size=kernel_size,
             strides=strides,
             padding=padding,
@@ -113,11 +113,11 @@ class GeneralizedConv2dTests:
         # Reshape output dictionary to match out_layout
         assert len(ref_outputs) == 1
         output_tensor_name, output_tensor = next(iter(ref_outputs.items()))
-        ref_outputs[output_tensor_name] = 
_change_ndarray_layout(output_tensor, "NHWC", out_layout)
+        ref_outputs[output_tensor_name] = change_ndarray_layout(output_tensor, 
"NHWC", out_layout)
 
-        test_input_data = _change_ndarray_layout(ref_input_data, "NHWC", 
data_layout)
+        test_input_data = change_ndarray_layout(ref_input_data, "NHWC", 
data_layout)
         test_input_var = relay.var("input", 
relay.TensorType(test_input_data.shape, in_dtype))
-        test_kernel_data = _change_ndarray_layout(ref_kernel_data, "HWIO", 
kernel_layout)
+        test_kernel_data = change_ndarray_layout(ref_kernel_data, "HWIO", 
kernel_layout)
 
         test_relay_op = relay.op.nn.conv2d(
             test_input_var,
diff --git a/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py 
b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py
new file mode 100644
index 0000000000..573231f963
--- /dev/null
+++ b/tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py
@@ -0,0 +1,358 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU 
use case. There have
+been some accuracy issues in the past, so this test steps through a model 
(MobileNetV1) layer by
+layer and ensures there is 1-1 correspondence at each step. This test would run way faster if we ran
+the model all at once, but then we wouldn't know which layers had issues.
+
+Furthermore, this test uses some in-development optimizations for microTVM 
that aren't part of the
+main pipeline.
+"""
+
+import numpy as np
+from PIL import Image
+import pytest
+
+import tvm
+import tvm.testing
+from tvm import meta_schedule, relay
+from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel
+from tvm.relay.backend import Executor, Runtime
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+from tvm.contrib.download import download_testdata
+from test_generalized_conv2d import change_ndarray_layout
+
+
+# The model is the v0.7 version of the TinyML person detection (aka visual 
wake words) model. This
+# is an RGB 96x96 MobileNet V1 model.
+MODEL_URL = "https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite"
+SAMPLE_URL = (
+    "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/elephant-299.jpg"
+)
+
+
[email protected](scope="module")
+def interpreter():
+    """Returns a TFLite interpreter with the MLPerf Tiny visual wakewords 
model loaded, with an
+    elephant image run through it, and with all intermediate layer outputs 
saved."""
+
+    # Make sure the Tensorflow import is skipped if the test is being skipped. 
This is needed to
+    # prevent the "python: i386" tests from failing, as they don't have 
Tensorflow installed.
+    import tensorflow as tf  # pylint: disable=import-outside-toplevel
+
+    # Download the reference model
+    rel_model_path = "model_microtvm_mobilenetv1.tflite"
+    file = download_testdata(MODEL_URL, rel_model_path, overwrite=False)
+
+    # Load it into TensorFlow and allocate memory
+    interpreter = tf.lite.Interpreter(file, 
experimental_preserve_all_tensors=True)
+    interpreter.allocate_tensors()
+
+    # Download an image. The neuron activations are strange if we use random 
data or ones,
+    # so downloading an image is useful.
+    rel_image_path = "image_microtvm_mobilenetv1.jpg"
+    img_path = download_testdata(SAMPLE_URL, rel_image_path, overwrite=False)
+    image = Image.open(img_path).resize((96, 96))
+    image_data_hwc_uint8 = np.asarray(image)
+    assert image_data_hwc_uint8.shape == (96, 96, 3)
+    assert image_data_hwc_uint8.dtype == "uint8"
+    image_data_nhwc_int8 = (image_data_hwc_uint8 + 
128).view("int8").reshape((1, 96, 96, 3))
+
+    # Load the image into the TFLite interpreter and compute all intermediate 
tensor values
+    input_details = interpreter.get_input_details()
+    interpreter.set_tensor(input_details[0]["index"], image_data_nhwc_int8)
+    interpreter.invoke()
+    return interpreter
+
+
+def _get_mobilenet_v1_layer_attributes(layer_num):
+    """Returns the relevant padding and stride for a given layer in a 
MobileNetV1 model. It's a huge
+    headache to read this data from TensorFlow, as it is not user accessible 
via the interpreter. If
+    we really wanted to, we would have to parse the .tflite file ourselves. 
This function is a bit
+    of a hack, but lets us skip that."""
+
+    if layer_num == 0:  # Regular conv2d
+        return ((0, 0, 1, 1), (2, 2), False)
+    if layer_num % 2 == 0:  # 1x1 conv2d
+        return ((0, 0, 0, 0), (1, 1), False)
+    if layer_num in [3, 7, 11, 23]:  # Downsizing depthwise_conv2d layers
+        return ((0, 0, 1, 1), (2, 2), True)
+    # Depthwise conv2d
+    return ((1, 1, 1, 1), (1, 1), True)
+
+
+def _get_relu_activation_prefix(layer_num):
+    if layer_num == 0:
+        return "model/activation/Relu;"
+    return f"model/activation_{layer_num}/Relu;"
+
+
+def _get_main_path_tensor_details(details, tensor_num):
+    """A "main path" tensor is a fused layer input/output. Gets the tensor 
details from the tensor
+    index, where 0 gives the original input tensor, 1 gives the output of the 
first fused
+    convolution layer, and so on. TFLite names are a little wack, so we get 
this information by
+    finding the SECOND tensor (which has the suffix "1") for each ReLU 
activation (the first tensor
+    is the bias)."""
+
+    if tensor_num == 0:
+        return details[0]
+    prefix = _get_relu_activation_prefix(tensor_num - 1)
+    detail = next(d for d in details if d["name"].startswith(prefix) and 
d["name"].endswith("1"))
+    assert len(detail["shape"]) == 4
+    assert detail["dtype"] == np.int8
+    return detail
+
+
+def _get_bias_details(details, layer_num):
+    """Gets the tensor details for the bias tensor for the corresponding 
convolution layer. The
+    bias tensors always appear before the main path tensors, so we don't have 
to check the ending to
+    make sure we have the right one."""
+    prefix = _get_relu_activation_prefix(layer_num)
+    detail = next(d for d in details if d["name"].startswith(prefix))
+    assert len(detail["shape"]) == 1
+    assert detail["dtype"] == np.int32
+    return detail
+
+
+def _get_kernel_details(details, layer_num):
+    """Gets the tensor details for the kernel tensor for the corresponding 
convolution layer. These
+    have a different naming scheme from the main path and bias tensors, as 
they are converted before
+    activation function fusion. Note that regular vs depthwise conv2ds have 
different prefixes."""
+
+    if layer_num == 0:
+        prefix = "model/conv2d/Conv2D"
+    elif layer_num % 2 == 0:
+        prefix = f"model/conv2d_{layer_num // 2}/"
+    else:
+        prefix = f"model/batch_normalization_{layer_num}/"
+
+    detail = next(d for d in details if d["name"].startswith(prefix))
+    assert len(detail["shape"]) == 4
+    assert detail["dtype"] == np.int8
+    return detail
+
+
+def _get_quant_scale_const(quantization_dict, as_scalar=False):
+    scales = quantization_dict["scales"]
+    if as_scalar:
+        assert len(scales) == 1
+        scales = scales[0]
+    return relay.const(scales, "float32")
+
+
+def _get_quant_zp_const(quantization_dict, as_scalar=False):
+    zero_points = quantization_dict["zero_points"]
+    if as_scalar:
+        assert len(zero_points) == 1
+        zero_points = zero_points[0]
+    return relay.const(zero_points, "int32")
+
+
+def _change_layout(data, old_layout, new_layout, dtype):
+    return change_ndarray_layout(data, old_layout, new_layout).astype(dtype)
+
+
+def _load_tflite_layer(interpreter, layer):
+    tensor_details = interpreter.get_tensor_details()
+
+    def lookup(detail):
+        return interpreter.get_tensor(detail["index"]), 
detail["quantization_parameters"]
+
+    input_data = lookup(_get_main_path_tensor_details(tensor_details, layer))
+    kernel_data = lookup(_get_kernel_details(tensor_details, layer))
+    bias_data = lookup(_get_bias_details(tensor_details, layer))
+    output_data = lookup(_get_main_path_tensor_details(tensor_details, layer + 
1))
+    return input_data, kernel_data, bias_data, output_data
+
+
+def _make_relay_partial_func(relay_op, *args, **kwargs):
+    return lambda op: relay_op(op, *args, **kwargs)
+
+
+def _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams, 
is_depthwise=False):
+    dtype, padding, strides, data_layout, kernel_layout, output_layout = 
hyperparams
+    kernel_size = kernel.shape[1:3]
+    if is_depthwise:
+        channels = groups = kernel.shape[3]
+    else:
+        channels = kernel.shape[0]
+        groups = 1
+
+    kernel_ndarr = _change_layout(kernel, "OHWI", kernel_layout, dtype)
+
+    return _make_relay_partial_func(
+        relay.qnn.op.conv2d,
+        relay.const(kernel_ndarr, dtype),
+        input_zero_point=_get_quant_zp_const(data_quant, as_scalar=True),
+        kernel_zero_point=_get_quant_zp_const(kernel_quant),
+        input_scale=_get_quant_scale_const(data_quant, as_scalar=True),
+        kernel_scale=_get_quant_scale_const(kernel_quant),
+        kernel_size=kernel_size,
+        data_layout=data_layout,
+        kernel_layout="IOHW" if is_depthwise else kernel_layout,
+        dilation=(1, 1),
+        strides=strides,
+        padding=padding,
+        groups=groups,
+        channels=channels,
+        out_dtype="int32",
+        out_layout=output_layout,
+    )
+
+
+def _make_bias_op(bias, output_layout):
+    requantize_axis = output_layout.index("C")
+    return _make_relay_partial_func(
+        relay.op.nn.bias_add,
+        relay.const(bias, "int32"),
+        axis=requantize_axis,
+    )
+
+
+def _make_requantize_op(bias_quant, output_quant, output_dtype, output_layout):
+    requantize_axis = output_layout.index("C")
+    return _make_relay_partial_func(
+        relay.qnn.op.requantize,
+        _get_quant_scale_const(bias_quant),
+        _get_quant_zp_const(bias_quant),
+        _get_quant_scale_const(output_quant, as_scalar=True),
+        _get_quant_zp_const(output_quant, as_scalar=True),
+        axis=requantize_axis,
+        compute_dtype="int64",
+        out_dtype=output_dtype,
+    )
+
+
+def _make_aot_model(params, hyperparams, layouts, is_depthwise=False):
+    tensors, quantizations = zip(*params)
+    data, kernel, bias, output = tensors
+    data_quant, kernel_quant, bias_quant, output_quant = quantizations
+
+    dtype, padding, _strides = hyperparams
+    data_layout, _, output_layout = layouts
+
+    if any(padding):
+        pad_const = int(data_quant["zero_points"][0])
+        pad_before = (0, padding[0], padding[1], 0)
+        pad_after = (0, padding[2], padding[3], 0)
+        data = np.pad(data, tuple(zip(pad_before, pad_after)), 
constant_values=pad_const)
+    data_ndarr = _change_layout(data, "NHWC", data_layout, dtype)
+    output_ndarr = _change_layout(output, "NHWC", output_layout, dtype)
+
+    input_var = relay.var("input", relay.TensorType(data_ndarr.shape, dtype))
+    conv2d = _make_conv2d_op(kernel, data_quant, kernel_quant, hyperparams + 
layouts, is_depthwise)
+    bias = _make_bias_op(bias, output_layout)
+    requantize = _make_requantize_op(bias_quant, output_quant, dtype, 
output_layout)
+
+    relay_mod = requantize(bias(conv2d(input_var)))
+    relay_func = relay.Function([input_var], relay_mod)
+    return AOTTestModel(
+        module=tvm.IRModule.from_expr(relay_func),
+        inputs={"input": data_ndarr},
+        outputs={"output": output_ndarr},
+        output_tolerance=1,
+    )
+
+
+def _make_target():
+    return tvm.target.Target("c -keys=arm_cpu -mcpu=cortex-m7")
+
+
+def _make_executor():
+    return Executor(
+        "aot",
+        {
+            "workspace-byte-alignment": 8,
+            "constant-byte-alignment": 8,
+            "interface-api": "c",
+            "unpacked-api": True,
+        },
+    )
+
+
[email protected]("layer", range(23))
[email protected]_corstone300
+def test_qnn_conv2d_mobilenetv1_layer(interpreter, layer):
+    """Checks microTVM output against TFLite for one MobileNetV1 layer.
+
+    Loads the input, kernel, bias, expected output, and quantization 
parameters from the specified
+    layer in a TFLite Interpreter. That information is used to construct a 
Relay Function with the
+    same structure. The Function is run using microTVM and AOTTestModel, and 
we verify microTVM's
+    output is the same as the TFLite ground truth.
+
+    This function only cross-checks the first 23 layers in MobileNetV1, which 
are regular and
+    depthwise 2D convolutions (this function only works for 2D convolutions). 
We do not test the
+    average pool, dense, or softmax layers at the end of the model.
+
+    Note that we disable the QNN Legalization pass. This allows TVM to use its 
QNN compute
+    definitions, fuse the three operations together, and perform other 
optimizations.
+
+    Parameters
+    ----------
+    interpreter: tensorflow.lite.python.interpreter.Interpreter
+        A TensorFlow Lite interpreter for a MobileNetV1 model, where invoke() 
has already been
+        called and experimental_preserve_all_tensors=True. Should be passed as 
a Pytest fixture.
+
+    layer: int
+        The index of the layer to check against TensorFlow's ground truth 
values.
+    """
+    dtype = "int16"
+
+    tensor, kernel, bias, output = _load_tflite_layer(interpreter, layer)
+
+    padding, strides, is_depthwise = _get_mobilenet_v1_layer_attributes(layer)
+    if is_depthwise:
+        data_layout, kernel_layout, output_layout = "NCHW", "OIHW", "NHWC"
+    else:
+        data_layout, kernel_layout, output_layout = "NHWC", "OHWI", "NHWC"
+
+    test_model = _make_aot_model(
+        (tensor, kernel, bias, output),
+        (dtype, padding, strides),
+        (data_layout, kernel_layout, output_layout),
+        is_depthwise=is_depthwise,
+    )
+
+    def schedule_fn(_sch):
+        return True
+
+    with tvm.transform.PassContext(
+        opt_level=3,
+        config={
+            "tir.disable_vectorize": True,
+            "relay.backend.use_meta_schedule": True,
+            "relay.backend.tir_converter": "allow_extern",
+        },
+        disabled_pass=["qnn.Legalize"],
+    ), meta_schedule.database.ScheduleFnDatabase(schedule_fn):
+        executor_factory = tvm.relay.build(
+            test_model.module,
+            _make_target(),
+            executor=_make_executor(),
+            runtime=Runtime("crt"),
+            params=test_model.params,
+            mod_name=test_model.name,
+        )
+        compiled = AOTCompiledTestModel(model=test_model, 
executor_factory=executor_factory)
+
+    run_and_check(
+        models=[compiled],
+        runner=AOT_CORSTONE300_RUNNER,
+        interface_api="c",
+        workspace_byte_alignment=8,
+        constant_byte_alignment=8,
+    )
diff --git a/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py 
b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py
new file mode 100644
index 0000000000..46d2797ba3
--- /dev/null
+++ b/tests/python/topi/python/test_topi_conv2d_tensordot_opts.py
@@ -0,0 +1,415 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for functions in tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot.
+
+Contains a few unit tests, followed by integration tests for common use cases. 
Note that we do not
+run the generated code - we just make sure the strings match exactly.
+
+Note that a *lot* of instruction reordering happens during compilation from C 
to assembly (by GCC or
+Clang). I've verified that this instruction reordering happens correctly for 
all the functions here.
+For more details on why the generated code is the way it is, see 
`tensordot_int16_impl`."""
+
+import textwrap
+
+from tvm.topi.arm_cpu.mprofile.dsp.micro_kernel.tensordot import (
+    _get_tensor_halfwords,
+    _get_kernel_halfwords,
+    tensordot_int16_impl,
+)
+
+
+def test_get_tensor_halfwords():
+    """Tests the _get_tensor_halfwords helper function in tensordot.py.
+
+    This function loads the logical indices of the data that will be stored in 
memory at the tensor
+    pointer. See the function docstring for more details.
+    """
+
+    # fmt: off
+    # A simple 3x3 depthwise convolution computing one output and with 
in_stride = 1. Note that each
+    # row is padded with None at the end to make the rows word-aligned.
+    assert _get_tensor_halfwords((48, 3, 3), 0, 1, 1) == [
+        (0, 0), (0, 1), (0, 2), None,
+        (1, 0), (1, 1), (1, 2), None,
+        (2, 0), (2, 1), (2, 2), None
+    ]
+
+    # If the tensor width is odd, padding alternates before/after every row.
+    assert _get_tensor_halfwords((49, 3, 3), 0, 1, 1) == [
+        (0, 0), (0, 1), (0, 2), None,
+        None, (1, 0), (1, 1), (1, 2),
+        (2, 0), (2, 1), (2, 2), None
+    ]
+
+    # If we are computing multiple outputs, more tensor data becomes relevant.
+    assert _get_tensor_halfwords((48, 3, 3), 0, 2, 1) == [
+        (0, 0), (0, 1), (0, 2), (0, 3),
+        (1, 0), (1, 1), (1, 2), (1, 3),
+        (2, 0), (2, 1), (2, 2), (2, 3)
+    ]
+
+    # If offset=1, relevant data starts one halfword after the kernel pointer.
+    assert _get_tensor_halfwords((48, 3, 3), 1, 1, 1) == [
+        None, (0, 0), (0, 1), (0, 2),
+        None, (1, 0), (1, 1), (1, 2),
+        None, (2, 0), (2, 1), (2, 2)
+    ]
+
+    # These adjustments can be (and often are) used together.
+    assert _get_tensor_halfwords((49, 3, 3), 1, 2, 2) == [
+        None, (0, 0), (0, 1), (0, 2), (0, 3), (0, 4),
+        (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), None,
+        None, (2, 0), (2, 1), (2, 2), (2, 3), (2, 4)
+    ]
+    # fmt: on
+
+
+def test_get_kernel_halfwords():
+    """Tests the _get_kernel_halfwords helper function in tensordot.py.
+
+    This function loads the logical indices of the data that will be stored in 
memory at the kernel
+    pointer. See the function docstring for more details.
+    """
+
+    # fmt: off
+    # Example of a kernel for a 3x3 depthwise convolution channel
+    assert _get_kernel_halfwords((96, 3, 3), 0) == [
+        (0, 0), (0, 1), (0, 2),
+        (1, 0), (1, 1), (1, 2),
+        (2, 0), (2, 1), (2, 2),
+        None,
+    ]
+
+    # Example of a kernel for a 1x1 regular convolution with 4 channels
+    assert _get_kernel_halfwords((48, 1, 4), 1) == [
+        None, (0, 0), (0, 1), (0, 2), (0, 3), None,
+    ]
+    # fmt: on
+
+
+def test_write_3x3_depthwise_code():
+    """This is the function that would be generated for a 1x4x48x48 NCHW input 
tensor with "SAME"
+    padding. We are only computing one sum at once, so we don't need stride or 
output. Note that
+    this is pretty inefficient - it would be much better to compute a few sums 
concurrently.
+
+    When inlined, this code compiles (with armv7-a clang 11) into:
+
+    tensordot_opt_x1_int16_w48_3x3_000(int*, int*, int*, int*, int*):
+        ldr.w   lr, [r3]
+        ldrd    r11, r4, [r1]
+        ldrd    r5, r9, [r1, #96]
+        ldrd    r10, r8, [r1, #192]
+        ldm.w   r2, {r1, r6, r7}
+        ldr.w   r12, [sp, #36]
+        smlad   r1, r11, r1, lr
+        smlabb  r1, r4, r6, r1
+        smlatb  r1, r6, r5, r1
+        ldrd    r3, r2, [r2, #12]
+        smlatb  r1, r5, r7, r1
+        smlatb  r1, r7, r9, r1
+        smlad   r1, r10, r3, r1
+        ldr.w   r3, [r12]
+        smlabb  r1, r8, r2, r1
+        smmul   r1, r3, r1
+        ssat    r1, #8, r1, asr #8
+        strh    r1, [r0]
+    """
+    _, code = tensordot_int16_impl(1, (48, 3, 3), (0, 0, 0), (1, 1))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
+    #define TENSORDOT_OPT_X1_INT16_W48_3X3_000_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t 
tensordot_opt_x1_int16_w48_3x3_000(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, 
int32_t *scale
+    ) {
+      int32_t sum_0 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__unknown = tensor[1];
+      int32_t tensor__y01_x00__y01_x01 = tensor[24];
+      int32_t tensor__y01_x02__unknown = tensor[25];
+      int32_t tensor__y02_x00__y02_x01 = tensor[48];
+      int32_t tensor__y02_x02__unknown = tensor[49];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y01_x00 = kernel[1];
+      int32_t kernel__y01_x01__y01_x02 = kernel[2];
+      int32_t kernel__y02_x00__y02_x01 = kernel[3];
+      int32_t kernel__y02_x02__unknown = kernel[4];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, 
sum_0);
+      sum_0 = __smlabb(tensor__y00_x02__unknown, kernel__y00_x02__y01_x00, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x00__y01_x01, kernel__y00_x02__y01_x00, 
sum_0);
+      sum_0 = __smlatb(tensor__y01_x00__y01_x01, kernel__y01_x01__y01_x02, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x02__unknown, kernel__y01_x01__y01_x02, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, 
sum_0);
+      sum_0 = __smlabb(tensor__y02_x02__unknown, kernel__y02_x02__unknown, 
sum_0);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+
+      ((int16_t*) output)[0] = (int16_t) requant_0;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_odd_width_3x3_depthwise_strides_code():
+    """This is the function that would be generated for a 1x4x48x48 NCHW input 
tensor with "SAME"
+    padding and (2, 2) strides, being written into NHWC layout. The layout 
change is encoded by
+    out_stride = 4. This is a common use case seen in MobileNetV1, among 
others.
+
+    Note that despite the rows not being word-aligned, the *tensor pointer 
will always be word
+    aligned (satisfying this requirement) since y_stride = 2."""
+
+    _, code = tensordot_int16_impl(2, (49, 3, 3), (0, 0, 0), (2, 4))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
+    #define TENSORDOT_OPT_X2_INT16_W49_3X3_000_2_4_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t 
tensordot_opt_x2_int16_w49_3x3_000_2_4(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, 
int32_t *scale
+    ) {
+      int32_t sum_0 = *bias, sum_1 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__y00_x03 = tensor[1];
+      int32_t tensor__y00_x04__unknown = tensor[2];
+      int32_t tensor__unknown__y01_x00 = tensor[24];
+      int32_t tensor__y01_x01__y01_x02 = tensor[25];
+      int32_t tensor__y01_x03__y01_x04 = tensor[26];
+      int32_t tensor__y02_x00__y02_x01 = tensor[49];
+      int32_t tensor__y02_x02__y02_x03 = tensor[50];
+      int32_t tensor__y02_x04__unknown = tensor[51];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y01_x00 = kernel[1];
+      int32_t kernel__y01_x01__y01_x02 = kernel[2];
+      int32_t kernel__y02_x00__y02_x01 = kernel[3];
+      int32_t kernel__y02_x02__unknown = kernel[4];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, 
sum_0);
+      sum_0 = __smlabb(tensor__y00_x02__y00_x03, kernel__y00_x02__y01_x00, 
sum_0);
+      sum_0 = __smlatt(tensor__unknown__y01_x00, kernel__y00_x02__y01_x00, 
sum_0);
+      sum_0 = __smlad(tensor__y01_x01__y01_x02, kernel__y01_x01__y01_x02, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x00__y02_x01, kernel__y02_x00__y02_x01, 
sum_0);
+      sum_0 = __smlabb(tensor__y02_x02__y02_x03, kernel__y02_x02__unknown, 
sum_0);
+      sum_1 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x00__y00_x01, 
sum_1);
+      sum_1 = __smlabb(tensor__y00_x04__unknown, kernel__y00_x02__y01_x00, 
sum_1);
+      sum_1 = __smlatt(tensor__y01_x01__y01_x02, kernel__y00_x02__y01_x00, 
sum_1);
+      sum_1 = __smlad(tensor__y01_x03__y01_x04, kernel__y01_x01__y01_x02, 
sum_1);
+      sum_1 = __smlad(tensor__y02_x02__y02_x03, kernel__y02_x00__y02_x01, 
sum_1);
+      sum_1 = __smlabb(tensor__y02_x04__unknown, kernel__y02_x02__unknown, 
sum_1);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
+      requant_1 = (requant_1 + 1) >> 1;
+      requant_1 = __ssat(requant_1 + -128, 8);
+
+      ((int16_t*) output)[0] = (int16_t) requant_0;
+      ((int16_t*) output)[4] = (int16_t) requant_1;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_1x1x8_convolution_code():
+    """This is the function that would be generated for a 1x48x48x8 NHWC input 
tensor under
+    standard convolution with a 1x1 kernel. This is a common use case seen in 
MobileNetV1,
+    among others. In this scenario, a very high amount of memory re-use means 
that summing
+    four channels at once makes us faster."""
+
+    _, code = tensordot_int16_impl(4, (48 * 8, 1, 8), (0, 0, 0), (8, 1))
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
+    #define TENSORDOT_OPT_X4_INT16_W384_1X8_000_8_1_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t 
tensordot_opt_x4_int16_w384_1x8_000_8_1(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, 
int32_t *scale
+    ) {
+      int32_t sum_0 = *bias, sum_1 = *bias, sum_2 = *bias, sum_3 = *bias;
+
+      int32_t tensor__y00_x00__y00_x01 = tensor[0];
+      int32_t tensor__y00_x02__y00_x03 = tensor[1];
+      int32_t tensor__y00_x04__y00_x05 = tensor[2];
+      int32_t tensor__y00_x06__y00_x07 = tensor[3];
+      int32_t tensor__y00_x08__y00_x09 = tensor[4];
+      int32_t tensor__y00_x0a__y00_x0b = tensor[5];
+      int32_t tensor__y00_x0c__y00_x0d = tensor[6];
+      int32_t tensor__y00_x0e__y00_x0f = tensor[7];
+      int32_t tensor__y00_x10__y00_x11 = tensor[8];
+      int32_t tensor__y00_x12__y00_x13 = tensor[9];
+      int32_t tensor__y00_x14__y00_x15 = tensor[10];
+      int32_t tensor__y00_x16__y00_x17 = tensor[11];
+      int32_t tensor__y00_x18__y00_x19 = tensor[12];
+      int32_t tensor__y00_x1a__y00_x1b = tensor[13];
+      int32_t tensor__y00_x1c__y00_x1d = tensor[14];
+      int32_t tensor__y00_x1e__y00_x1f = tensor[15];
+
+      int32_t kernel__y00_x00__y00_x01 = kernel[0];
+      int32_t kernel__y00_x02__y00_x03 = kernel[1];
+      int32_t kernel__y00_x04__y00_x05 = kernel[2];
+      int32_t kernel__y00_x06__y00_x07 = kernel[3];
+
+      sum_0 = __smlad(tensor__y00_x00__y00_x01, kernel__y00_x00__y00_x01, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x02__y00_x03, kernel__y00_x02__y00_x03, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x04__y00_x05, kernel__y00_x04__y00_x05, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x06__y00_x07, kernel__y00_x06__y00_x07, 
sum_0);
+      sum_1 = __smlad(tensor__y00_x08__y00_x09, kernel__y00_x00__y00_x01, 
sum_1);
+      sum_1 = __smlad(tensor__y00_x0a__y00_x0b, kernel__y00_x02__y00_x03, 
sum_1);
+      sum_1 = __smlad(tensor__y00_x0c__y00_x0d, kernel__y00_x04__y00_x05, 
sum_1);
+      sum_1 = __smlad(tensor__y00_x0e__y00_x0f, kernel__y00_x06__y00_x07, 
sum_1);
+      sum_2 = __smlad(tensor__y00_x10__y00_x11, kernel__y00_x00__y00_x01, 
sum_2);
+      sum_2 = __smlad(tensor__y00_x12__y00_x13, kernel__y00_x02__y00_x03, 
sum_2);
+      sum_2 = __smlad(tensor__y00_x14__y00_x15, kernel__y00_x04__y00_x05, 
sum_2);
+      sum_2 = __smlad(tensor__y00_x16__y00_x17, kernel__y00_x06__y00_x07, 
sum_2);
+      sum_3 = __smlad(tensor__y00_x18__y00_x19, kernel__y00_x00__y00_x01, 
sum_3);
+      sum_3 = __smlad(tensor__y00_x1a__y00_x1b, kernel__y00_x02__y00_x03, 
sum_3);
+      sum_3 = __smlad(tensor__y00_x1c__y00_x1d, kernel__y00_x04__y00_x05, 
sum_3);
+      sum_3 = __smlad(tensor__y00_x1e__y00_x1f, kernel__y00_x06__y00_x07, 
sum_3);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 32;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + -128, 8);
+      int32_t requant_1 = (sum_1 * (int64_t) scale_val) >> 32;
+      requant_1 = (requant_1 + 1) >> 1;
+      requant_1 = __ssat(requant_1 + -128, 8);
+      int32_t requant_2 = (sum_2 * (int64_t) scale_val) >> 32;
+      requant_2 = (requant_2 + 1) >> 1;
+      requant_2 = __ssat(requant_2 + -128, 8);
+      int32_t requant_3 = (sum_3 * (int64_t) scale_val) >> 32;
+      requant_3 = (requant_3 + 1) >> 1;
+      requant_3 = __ssat(requant_3 + -128, 8);
+
+      int32_t packed_res_0 = requant_0 + (requant_1 << 16);
+      int32_t packed_res_1 = requant_2 + (requant_3 << 16);
+      output[0] = packed_res_0;
+      output[1] = packed_res_1;
+      return 0;
+    }
+    #endif
+    """
+    )
+
+
+def test_3x3x3_offset_convolution_code():
+    """This is the function that would be generated for a 1x96x96x3 NHWC input 
tensor under
+    standard convolution with a 3x3x3 kernel - the first layer of MobileNetV1. 
This is special, as
+    it means that every other kernel channel will not start on an even 
numbered halfword. We won't
+    have this issue for the input tensor, as we will always compute two 
positions at a time.
+
+    To solve this 'every other' issue, we will need two different versions of this function to
+    alternate between. This alternation will be handled in TIR scheduling. 
Here, we just test the
+    version where the kernel is not word aligned.
+
+    Also tests the requantize_shift and output_zero_point keyword args. These 
might be needed for
+    some ResNet models (like image classification from MLPerf Tiny).
+    """
+
+    _, code = tensordot_int16_impl(
+        1,
+        (96 * 3, 3, 9),
+        (1, 1, 1),
+        (3, 1),
+        requantize_shift=40,
+        output_zero_point=4,
+    )
+    assert code == textwrap.dedent(
+        """
+    #ifndef TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
+    #define TENSORDOT_OPT_X1_INT16_W288_3X9_111_EXISTS
+    #include <arm_acle.h>
+    __attribute__((always_inline)) static inline int32_t 
tensordot_opt_x1_int16_w288_3x9_111(
+        int32_t *output, int32_t *tensor, int32_t *kernel, int32_t *bias, 
int32_t *scale
+    ) {
+      int32_t sum_0 = *bias;
+
+      int32_t tensor__unknown__y00_x00 = tensor[0];
+      int32_t tensor__y00_x01__y00_x02 = tensor[1];
+      int32_t tensor__y00_x03__y00_x04 = tensor[2];
+      int32_t tensor__y00_x05__y00_x06 = tensor[3];
+      int32_t tensor__y00_x07__y00_x08 = tensor[4];
+      int32_t tensor__unknown__y01_x00 = tensor[144];
+      int32_t tensor__y01_x01__y01_x02 = tensor[145];
+      int32_t tensor__y01_x03__y01_x04 = tensor[146];
+      int32_t tensor__y01_x05__y01_x06 = tensor[147];
+      int32_t tensor__y01_x07__y01_x08 = tensor[148];
+      int32_t tensor__unknown__y02_x00 = tensor[288];
+      int32_t tensor__y02_x01__y02_x02 = tensor[289];
+      int32_t tensor__y02_x03__y02_x04 = tensor[290];
+      int32_t tensor__y02_x05__y02_x06 = tensor[291];
+      int32_t tensor__y02_x07__y02_x08 = tensor[292];
+
+      int32_t kernel__unknown__y00_x00 = kernel[0];
+      int32_t kernel__y00_x01__y00_x02 = kernel[1];
+      int32_t kernel__y00_x03__y00_x04 = kernel[2];
+      int32_t kernel__y00_x05__y00_x06 = kernel[3];
+      int32_t kernel__y00_x07__y00_x08 = kernel[4];
+      int32_t kernel__y01_x00__y01_x01 = kernel[5];
+      int32_t kernel__y01_x02__y01_x03 = kernel[6];
+      int32_t kernel__y01_x04__y01_x05 = kernel[7];
+      int32_t kernel__y01_x06__y01_x07 = kernel[8];
+      int32_t kernel__y01_x08__y02_x00 = kernel[9];
+      int32_t kernel__y02_x01__y02_x02 = kernel[10];
+      int32_t kernel__y02_x03__y02_x04 = kernel[11];
+      int32_t kernel__y02_x05__y02_x06 = kernel[12];
+      int32_t kernel__y02_x07__y02_x08 = kernel[13];
+
+      sum_0 = __smlatt(tensor__unknown__y00_x00, kernel__unknown__y00_x00, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x01__y00_x02, kernel__y00_x01__y00_x02, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x03__y00_x04, kernel__y00_x03__y00_x04, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x05__y00_x06, kernel__y00_x05__y00_x06, 
sum_0);
+      sum_0 = __smlad(tensor__y00_x07__y00_x08, kernel__y00_x07__y00_x08, 
sum_0);
+      sum_0 = __smlatb(tensor__unknown__y01_x00, kernel__y01_x00__y01_x01, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x01__y01_x02, kernel__y01_x00__y01_x01, 
sum_0);
+      sum_0 = __smlatb(tensor__y01_x01__y01_x02, kernel__y01_x02__y01_x03, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x03__y01_x04, kernel__y01_x02__y01_x03, 
sum_0);
+      sum_0 = __smlatb(tensor__y01_x03__y01_x04, kernel__y01_x04__y01_x05, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x05__y01_x06, kernel__y01_x04__y01_x05, 
sum_0);
+      sum_0 = __smlatb(tensor__y01_x05__y01_x06, kernel__y01_x06__y01_x07, 
sum_0);
+      sum_0 = __smlabt(tensor__y01_x07__y01_x08, kernel__y01_x06__y01_x07, 
sum_0);
+      sum_0 = __smlatb(tensor__y01_x07__y01_x08, kernel__y01_x08__y02_x00, 
sum_0);
+      sum_0 = __smlatt(tensor__unknown__y02_x00, kernel__y01_x08__y02_x00, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x01__y02_x02, kernel__y02_x01__y02_x02, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x03__y02_x04, kernel__y02_x03__y02_x04, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x05__y02_x06, kernel__y02_x05__y02_x06, 
sum_0);
+      sum_0 = __smlad(tensor__y02_x07__y02_x08, kernel__y02_x07__y02_x08, 
sum_0);
+
+      int32_t scale_val = *scale;
+      int32_t requant_0 = (sum_0 * (int64_t) scale_val) >> 39;
+      requant_0 = (requant_0 + 1) >> 1;
+      requant_0 = __ssat(requant_0 + 4, 8);
+
+      ((int16_t*) output)[1] = (int16_t) requant_0;
+      return 0;
+    }
+    #endif
+    """
+    )
diff --git a/tests/scripts/request_hook/request_hook.py 
b/tests/scripts/request_hook/request_hook.py
index ce379b6b2c..cb24353539 100644
--- a/tests/scripts/request_hook/request_hook.py
+++ b/tests/scripts/request_hook/request_hook.py
@@ -145,6 +145,7 @@ URL_MAP = {
     
"https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy":
 
f"{BASE}/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
     
"https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy":
 
f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy",
     
"https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite":
 
f"{BASE}/tlc-pack/web-data/raw/main/testdata/microTVM/model/keyword_spotting_quant.tflite",
+    
"https://github.com/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite":
 
f"{BASE}/mlcommons/tiny/raw/v0.7/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite",
     "https://github.com/uwsampl/web-data/raw/main/vta/models/synset.txt": 
f"{BASE}/2022-10-05/synset.txt",
     "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch": 
f"{BASE}/gcn_cora.torch",
     "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg": 
f"{BASE}/vta_cat.jpg",
