This is an automated email from the ASF dual-hosted git repository.
areusch pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 76c78a9 [Topi] Cortex-M DSP support (#9233)
76c78a9 is described below
commit 76c78a9d4017ebd61730e7451fa765ae3b00071b
Author: Sergei Smirnov <[email protected]>
AuthorDate: Mon Nov 15 20:41:18 2021 +0300
[Topi] Cortex-M DSP support (#9233)
Co-authored-by: Sergey Smirnov <[email protected]>
Co-authored-by: Ekaterina Bern <[email protected]>
Co-authored-by: Mikhail Trubnikov <[email protected]>
Co-authored-by: German Tretiakov <[email protected]>
Co-authored-by: Ilya Gozman <[email protected]>
Co-authored-by: Alexey.Yazev <[email protected]>
Co-authored-by: Ilya Gozman
<[email protected]>
---
python/tvm/relay/op/strategy/arm_cpu.py | 93 +++++-
python/tvm/target/arm_isa.py | 24 +-
python/tvm/testing/plugin.py | 1 +
python/tvm/testing/utils.py | 12 +
python/tvm/topi/arm_cpu/__init__.py | 4 +-
.../{target/arm_isa.py => topi/arm_cpu/conv1d.py} | 28 +-
python/tvm/topi/arm_cpu/conv2d.py | 23 +-
python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py | 186 -----------
.../tvm/topi/arm_cpu/dense.py | 13 +-
.../arm_cpu/{cortex_m7 => mprofile}/__init__.py | 5 +-
.../micro_kernel => mprofile/dsp}/__init__.py | 0
.../direct_simd.py => mprofile/dsp/conv1d.py} | 123 ++++---
.../direct_simd.py => mprofile/dsp/conv2d.py} | 30 +-
python/tvm/topi/arm_cpu/mprofile/dsp/dense.py | 52 +++
.../dsp}/micro_kernel/__init__.py | 0
.../arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py | 146 +++++++++
.../arm_cpu/mprofile/dsp/micro_kernel/common.py | 20 +-
.../dsp}/micro_kernel/gemm.py | 91 ++++--
.../arm_cpu/mprofile/dsp/micro_kernel/max_pool.py | 165 ++++++++++
python/tvm/topi/arm_cpu/mprofile/dsp/pool.py | 125 ++++++++
.../{cortex_m7/conv2d/__init__.py => pooling.py} | 10 +-
tests/python/conftest.py | 21 ++
tests/python/integration/test_arm_mprofile_dsp.py | 355 +++++++++++++++++++++
tests/python/relay/aot/aot_test_utils.py | 24 +-
tests/scripts/task_python_integration.sh | 5 +
tests/scripts/task_python_integration_gpuonly.sh | 1 +
26 files changed, 1191 insertions(+), 366 deletions(-)
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py
b/python/tvm/relay/op/strategy/arm_cpu.py
index 06dfc87..35db043 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -19,7 +19,7 @@
import re
import logging
-from tvm import topi
+from tvm import relay, topi
from ....target import arm_isa
from ....topi.generic import conv2d as conv2d_generic
from .generic import *
@@ -49,6 +49,25 @@ def schedule_concatenate_arm_cpu(_, outs, target):
return topi.arm_cpu.schedule_concatenate(outs)
+@schedule_pool.register(["arm_cpu"])
+def schedule_pool_arm_cpu(attrs, outs, target):
+ """schedule pooling ops arm cpu"""
+ layout = attrs.layout
+ isa = arm_isa.IsaAnalyzer(target)
+ avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs)
+ with target:
+ if (
+ avg_pool
+ and isa.has_dsp_support
+ and layout in ("NCW", "NCHW")
+ or not avg_pool
+ and isa.has_dsp_support
+ and layout in ("NWC", "NHWC")
+ ):
+ return topi.arm_cpu.schedule_pool(outs, layout)
+ return topi.generic.schedule_pool(outs, layout)
+
+
@conv2d_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d arm cpu strategy"""
@@ -128,11 +147,11 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
name="conv2d_hwcn.generic",
)
elif layout == "NHWC":
- if "SMLAD" in isa and kernel_layout == "HWOI":
+ if isa.has_dsp_support and kernel_layout == "HWOI":
strategy.add_implementation(
- wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_direct_simd),
-
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_direct_simd),
- name="conv2d_nhwc_direct_simd.micro_dev",
+ wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
+ wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
+ name="conv2d_nhwc_dsp.micro_dev",
)
elif kernel_layout == "HWIO":
is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm()
@@ -415,3 +434,67 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs,
out_type, target):
name="bitserial_dense.arm_cpu",
)
return strategy
+
+
+@dense_strategy.register(["arm_cpu"])
+def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
+ """dense arm cpu strategy"""
+ strategy = _op.OpStrategy()
+ isa = arm_isa.IsaAnalyzer(target)
+ if isa.has_dsp_support:
+ strategy.add_implementation(
+ wrap_compute_dense(topi.nn.dense),
+ wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
+ name="dense_dsp",
+ )
+ else:
+ strategy.add_implementation(
+ wrap_compute_dense(topi.nn.dense),
+ wrap_topi_schedule(topi.generic.schedule_dense),
+ name="dense.generic",
+ )
+ return strategy
+
+
+@conv1d_strategy.register("arm_cpu")
+def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target):
+ """conv1d strategy"""
+ strategy = _op.OpStrategy()
+ layout = attrs.data_layout
+ kernel_layout = attrs.kernel_layout
+ dilation = get_const_tuple(attrs.dilation)
+ if dilation[0] < 1:
+ raise ValueError("dilation should be a positive value")
+
+ isa = arm_isa.IsaAnalyzer(target)
+
+ if kernel_layout == "WOI":
+ if layout == "NWC" and isa.has_dsp_support:
+ strategy.add_implementation(
+ wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp),
+ wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp),
+ name="conv1d_dsp",
+ )
+ else:
+ raise RuntimeError(
+ "Unsupported kernel layout {} for conv1d {} for arm
cpu.".format(
+ kernel_layout, layout
+ )
+ )
+ elif layout == "NCW":
+ strategy.add_implementation(
+ wrap_compute_conv1d(topi.nn.conv1d_ncw),
+ wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
+ name="conv1d_ncw.generic",
+ )
+ elif layout == "NWC":
+ strategy.add_implementation(
+ wrap_compute_conv1d(topi.nn.conv1d_nwc),
+ wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
+ name="conv1d_nwc.generic",
+ )
+ else:
+ raise RuntimeError(
+ "Unsupported kernel layout {} for conv1d {} for arm
cpu.".format(kernel_layout, layout)
+ )
+ return strategy
diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py
index 60fc659..a5ac9b1 100644
--- a/python/tvm/target/arm_isa.py
+++ b/python/tvm/target/arm_isa.py
@@ -16,18 +16,24 @@
# under the License.
"""Defines functions to analyze available opcodes in the ARM ISA."""
+import tvm.target
-ARM_ISA_MAP = {
- "armv7e-m": ["SMLAD"],
-}
+
+ARM_MPROFILE_DSP_SUPPORT_LIST = [
+ "cortex-m7",
+ "cortex-m4",
+ "cortex-m33",
+ "cortex-m35p",
+ "cortex-m55",
+]
class IsaAnalyzer(object):
+ """Checks ISA support for given target"""
+
def __init__(self, target):
- self.target = target
- # TODO: actually parse -mcpu
- arch = "armv7e-m"
- self._isa_map = ARM_ISA_MAP[arch]
+ self.target = tvm.target.Target(target)
- def __contains__(self, instruction):
- return instruction in self._isa_map
+ @property
+ def has_dsp_support(self):
+ return self.target.mcpu is not None and self.target.mcpu in
ARM_MPROFILE_DSP_SUPPORT_LIST
diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py
index c0decb7..e90bd5e 100644
--- a/python/tvm/testing/plugin.py
+++ b/python/tvm/testing/plugin.py
@@ -49,6 +49,7 @@ MARKERS = {
"llvm": "mark a test as requiring llvm",
"ethosn": "mark a test as requiring ethosn",
"hexagon": "mark a test as requiring hexagon",
+ "corstone300": "mark a test as requiring Corstone300 FVP",
}
diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
index 4188fea..768705a 100644
--- a/python/tvm/testing/utils.py
+++ b/python/tvm/testing/utils.py
@@ -674,6 +674,18 @@ def requires_opencl(*args):
return _compose(args, _requires_opencl)
+def requires_corstone300(*args):
+ """Mark a test as requiring the corstone300 FVP
+
+ Parameters
+ ----------
+ f : function
+ Function to mark
+ """
+ _requires_corstone300 = [pytest.mark.corstone300]
+ return _compose(args, _requires_corstone300)
+
+
def requires_rocm(*args):
"""Mark a test as requiring the rocm runtime.
diff --git a/python/tvm/topi/arm_cpu/__init__.py
b/python/tvm/topi/arm_cpu/__init__.py
index 9e2057a..20f92a8 100644
--- a/python/tvm/topi/arm_cpu/__init__.py
+++ b/python/tvm/topi/arm_cpu/__init__.py
@@ -17,6 +17,7 @@
# pylint: disable=wildcard-import
"""Schedule for ARM CPU"""
+from .conv1d import *
from .conv2d import *
from .depthwise_conv2d import *
from .conv2d_transpose import *
@@ -25,5 +26,6 @@ from . import conv2d_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
-from . import cortex_m7
from .group_conv2d import *
+from .pooling import *
+from .dense import *
diff --git a/python/tvm/target/arm_isa.py b/python/tvm/topi/arm_cpu/conv1d.py
similarity index 51%
copy from python/tvm/target/arm_isa.py
copy to python/tvm/topi/arm_cpu/conv1d.py
index 60fc659..54a6968 100644
--- a/python/tvm/target/arm_isa.py
+++ b/python/tvm/topi/arm_cpu/conv1d.py
@@ -14,20 +14,24 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Defines functions to analyze available opcodes in the ARM ISA."""
+# pylint: disable=invalid-name, unused-variable, no-else-return,
unused-argument, import-outside-toplevel
+"""Conv1D schedule for ARM CPU"""
+from __future__ import absolute_import as _abs
+from tvm import autotvm
-ARM_ISA_MAP = {
- "armv7e-m": ["SMLAD"],
-}
+from .mprofile.dsp.conv1d import (
+ conv1d_nwc_dsp_compute,
+ conv1d_nwc_dsp_schedule,
+)
-class IsaAnalyzer(object):
- def __init__(self, target):
- self.target = target
- # TODO: actually parse -mcpu
- arch = "armv7e-m"
- self._isa_map = ARM_ISA_MAP[arch]
[email protected]_topi_compute("conv1d_nwc_dsp.arm_cpu")
+def conv1d_nwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+ """Compute conv1d with v7e-m DSP instructions."""
+ return conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding,
dilation, out_dtype)
- def __contains__(self, instruction):
- return instruction in self._isa_map
+
[email protected]_topi_schedule("conv1d_nwc_dsp.arm_cpu")
+def schedule_conv1d_nwc_dsp(cfg, outs):
+ return conv1d_nwc_dsp_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/conv2d.py
b/python/tvm/topi/arm_cpu/conv2d.py
index 0500eb5..ab48916 100644
--- a/python/tvm/topi/arm_cpu/conv2d.py
+++ b/python/tvm/topi/arm_cpu/conv2d.py
@@ -33,7 +33,10 @@ from .conv2d_spatial_pack import (
schedule_conv2d_spatial_pack_nchw,
schedule_conv2d_spatial_pack_nhwc,
)
-from .cortex_m7.conv2d import direct_simd
+from .mprofile.dsp.conv2d import (
+ conv2d_nhwc_dsp_compute,
+ conv2d_nhwc_dsp_schedule,
+)
@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu")
@@ -505,15 +508,13 @@ def
schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs):
return s
[email protected]_topi_compute("conv2d_nhwc_direct_simd.arm_cpu")
-def conv2d_nhwc_direct_simd(cfg, data, kernel, strides, padding, dilation,
out_dtype):
- """Compute conv2d_nhwc with SIMD (v7e-m)."""
- return direct_simd.conv2d_nhwc_direct_simd_compute(
- cfg, data, kernel, strides, padding, dilation, out_dtype
- )
[email protected]_topi_compute("conv2d_nhwc_dsp.arm_cpu")
+def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype):
+ """Compute conv2d_nhwc with v7e-m DSP instructions."""
+ return conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding,
dilation, out_dtype)
[email protected]_topi_schedule("conv2d_nhwc_direct_simd.arm_cpu")
-def schedule_conv2d_nhwc_direct_simd(cfg, outs):
- """Create schedule for conv2d_nhwc_direct_simd"""
- return direct_simd.conv2d_nhwc_direct_simd_schedule(cfg, outs)
[email protected]_topi_schedule("conv2d_nhwc_dsp.arm_cpu")
+def schedule_conv2d_nhwc_dsp(cfg, outs):
+ """Create schedule for conv2d_nhwc_dsp"""
+ return conv2d_nhwc_dsp_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py
b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py
deleted file mode 100644
index 4f721da..0000000
--- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Direct implementation of conv2d."""
-
-import tvm
-from tvm import autotvm
-from tvm.autotvm.task import deserialize_args
-from tvm.topi.nn.conv2d import conv2d_nchw, conv2d_nhwc
-from tvm.topi.utils import get_const_tuple, get_const_int, traverse_inline
-
-
-def conv2d_direct(*args, **kwargs):
- """Schedule function for directly-scheduled conv2d."""
- assert not kwargs, "Do not support kwargs in template function call"
- args = deserialize_args(args)
- data, kernel = args[:2]
- layout = args[-2]
- cfg = autotvm.get_config()
- args = [cfg] + args
- conv = conv2d_direct_compute(*args)
- if layout == "NHWC":
- sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv])
- elif layout == "NCHW":
- sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv])
- else:
- raise RuntimeError(f'unsupported data layout "{layout}"')
- return sched, [data, kernel, conv]
-
-
-conv2d_direct.template_key = "direct"
-conv2d_direct.default_data_layout = "NHWC"
-conv2d_direct.default_kernel_layout = "HWIO"
-
-
[email protected]_topi_compute("conv2d_direct.micro_dev")
-def conv2d_direct_compute(*args):
- layout = args[-2]
- if layout == "NHWC":
- return _conv2d_direct_nhwc_compute(*args)
- if layout == "NCHW":
- return _conv2d_direct_nchw_compute(*args)
-
- raise RuntimeError(f'unsupported data layout "{layout}"')
-
-
-def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation,
layout, out_dtype):
- assert layout == "NHWC"
- conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
-
- # Config Space Definition
- N, H, W, CI = get_const_tuple(data.shape)
- KH, KW, _, CO = get_const_tuple(kernel.shape)
- n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO)
- kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI)
-
- # TODO should we add a max_factor attr to these splits?
- co, vc = cfg.define_split("tile_co", co, num_outputs=2)
- oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2)
- ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2)
-
- cfg.define_reorder(
- "reorder_0",
- [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
- policy="candidate",
- candidate=[
- [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
- [n, co, oh, ow, ci, kh, kw, vc, vh, vw],
- [n, co, oh, ow, ci, vh, vw, vc, kh, kw],
- [n, co, oh, ow, ci, vc, vh, vw, kh, kw],
- ],
- )
-
- cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
- cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll")
-
- cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
- cfg.define_knob("unroll_explicit", [0, 1])
-
- return conv
-
-
-def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation,
layout, out_dtype):
- assert layout == "NCHW"
- conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
-
- ###########################
- # Config Space Definition #
- ###########################
- cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
- cfg.define_knob("unroll_explicit", [0, 1])
-
- return conv
-
-
[email protected]_topi_schedule("conv2d_direct_nhwc.micro_dev")
-def conv2d_direct_nhwc_schedule(cfg, outs):
- """Schedule function for directly-scheduled conv2d on NHWC layout."""
- sched = tvm.create_schedule([x.op for x in outs])
-
- def _callback(op):
- if "conv2d_nhwc" not in op.tag:
- return
-
- ### extract tensors ###
- output = op.output(0)
- conv = op
- data_vec = conv.input_tensors[0]
- kernel = conv.input_tensors[1] # pylint: disable=unused-variable
- last = outs[0] # pylint: disable=unused-variable
-
- # tile reduction axes
- n, oh, ow, co = sched[conv].op.axis
- kh, kw, ci = sched[conv].op.reduce_axis
- # NOTE we can't inline data padding in the SIMD path, because it
- # introduces conditionals in the inner loop.
- data_pad = data_vec.op
- sched[data_pad].compute_inline()
-
- co, vc = cfg["tile_co"].apply(sched, conv, co)
- oh, vh = cfg["tile_oh"].apply(sched, conv, oh)
- ow, vw = cfg["tile_ow"].apply(sched, conv, ow)
- cfg["reorder_0"].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh,
vw, vc])
- cfg["ann_reduce"].apply(
- sched,
- conv,
- [kh, kw],
- axis_lens=[get_const_int(kh.dom.extent),
get_const_int(kw.dom.extent)],
- max_unroll=8,
- cfg=cfg,
- )
- cfg["ann_spatial"].apply(
- sched,
- conv,
- [vh, vw, vc],
- axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1],
cfg["tile_co"].size[-1]],
- max_unroll=8,
- cfg=cfg,
- )
-
- kernel_scope = n # this is the scope to attach global config inside
this kernel
-
- # tune unroll
- sched[output].pragma(kernel_scope, "auto_unroll_max_step",
cfg["auto_unroll_max_step"].val)
- sched[output].pragma(kernel_scope, "unroll_explicit",
cfg["unroll_explicit"].val)
-
- traverse_inline(sched, outs[-1].op, _callback)
- return sched
-
-
[email protected]_topi_schedule("conv2d_direct_nchw.micro_dev")
-def conv2d_direct_nchw_schedule(cfg, outs):
- """Schedule function for Cortex-M7 direct implementation of conv2d."""
- # use default schedule
- sched = tvm.create_schedule([x.op for x in outs])
-
- conv = outs[-1].op
- output = conv.output(0)
- data_vec = conv.input_tensors[0]
- data_pad = data_vec.op
- sched[data_pad].compute_inline()
-
- # TODO add more schedule opts (similar to the NHWC template)
-
- n, _, _, _ = sched[conv].op.axis
- kernel_scope = n # this is the scope to attach global config inside this
kernel
-
- # tune unroll
- sched[output].pragma(kernel_scope, "auto_unroll_max_step",
cfg["auto_unroll_max_step"].val)
- sched[output].pragma(kernel_scope, "unroll_explicit",
cfg["unroll_explicit"].val)
-
- return sched
diff --git a/tests/scripts/task_python_integration_gpuonly.sh
b/python/tvm/topi/arm_cpu/dense.py
old mode 100755
new mode 100644
similarity index 72%
copy from tests/scripts/task_python_integration_gpuonly.sh
copy to python/tvm/topi/arm_cpu/dense.py
index 36c3883..f2e2eb6
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/python/tvm/topi/arm_cpu/dense.py
@@ -1,4 +1,3 @@
-#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -15,10 +14,12 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+# pylint: disable=invalid-name, unused-variable, no-else-return,
unused-argument, import-outside-toplevel
+"""Dense schedule for ARM CPU"""
-export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl
-device=mali,aocl_sw_emu"
-export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
-export TVM_RELAY_TEST_TARGETS="cuda"
-export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
+from .mprofile.dsp.dense import dense_dsp_schedule
-./tests/scripts/task_python_integration.sh
+
+def schedule_dense_dsp(outs):
+ """Create schedule for dense_dsp"""
+ return dense_dsp_schedule(outs)
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py
b/python/tvm/topi/arm_cpu/mprofile/__init__.py
similarity index 92%
rename from python/tvm/topi/arm_cpu/cortex_m7/__init__.py
rename to python/tvm/topi/arm_cpu/mprofile/__init__.py
index 631c5f7..32ce4d3 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py
+++ b/python/tvm/topi/arm_cpu/mprofile/__init__.py
@@ -14,7 +14,4 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Schedules specialized for cortex-m7."""
-
-
-from . import conv2d
+"""Schedules specialized for cortex-m DSP instructions."""
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py
similarity index 100%
copy from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
copy to python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py
similarity index 54%
copy from python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
copy to python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py
index 5ef9fd8..521a58d 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py
@@ -15,100 +15,86 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
-"""Direct implementation of conv2d."""
-
+"""Direct implementation of conv1d."""
from tvm import autotvm
from tvm.autotvm.task import deserialize_args
from tvm import te
from tvm.topi.utils import simplify, traverse_inline
from tvm.topi.nn.pad import pad
-from tvm.topi.nn.utils import get_pad_tuple
+from tvm.topi.nn.utils import get_pad_tuple1d
+from tvm.tir.expr import Mul
-from ..micro_kernel.gemm import (
+from .micro_kernel.gemm import (
intrin_gemm_MxKxN,
gemm_MxKxN_impl,
)
-def conv2d_nhwc_direct_simd(*args, **kwargs):
- """Defines the Cortex-M7 SIMD implementation of conv2d."""
+def conv1d_nwc_dsp(*args, **kwargs):
+ """Defines the v7e-m DSP instructions of conv1d on NWC layout."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
layout = args[-2]
cfg = autotvm.get_config()
args = [cfg] + args
- assert layout == "NHWC"
- conv = conv2d_nhwc_direct_simd_compute(*args)
- sched = conv2d_nhwc_direct_simd_schedule(cfg, [data, kernel, conv])
+ assert layout == "NWC"
+ conv = conv1d_nwc_dsp_compute(*args)
+ sched = conv1d_nwc_dsp_schedule(cfg, [data, kernel, conv])
return sched, [data, kernel, conv]
-conv2d_nhwc_direct_simd.template_key = "direct_simd"
-conv2d_nhwc_direct_simd.default_data_layout = "NHWC"
-conv2d_nhwc_direct_simd.default_kernel_layout = "HWOI"
-
+conv1d_nwc_dsp.template_key = "dsp"
+conv1d_nwc_dsp.default_data_layout = "NWC"
+conv1d_nwc_dsp.default_kernel_layout = "WOI"
-def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding,
dilation, out_dtype):
- """Compute function for Cortex-M7 SIMD implementation of conv2d."""
- assert isinstance(strides, int) or len(strides) == 2
- assert isinstance(dilation, int) or len(dilation) == 2
- if isinstance(strides, int):
- stride_h = stride_w = strides
- else:
- stride_h, stride_w = strides
+def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation,
out_dtype):
+ """Compute function for v7e-m DSP instructions of conv1d on NWC layout."""
+ if isinstance(strides, (tuple, list)):
+ strides = strides[0]
+ if isinstance(dilation, (tuple, list)):
+ dilation = dilation[0]
- if isinstance(dilation, int):
- dilation_h = dilation_w = dilation
- else:
- dilation_h, dilation_w = dilation
+ batch_size, data_width, in_channels = data.shape
+ kernel_size, out_channels, _ = kernel.shape
- batch_size, in_height, in_width, in_channels = data.shape
- kernel_h, kernel_w, out_channels, _ = kernel.shape
+ # Compute the output shape
+ dilated_kernel_size = (kernel_size - 1) * dilation + 1
+ pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
+ out_channels = simplify(out_channels)
+ out_width = simplify((data_width - dilated_kernel_size + pad_left +
pad_right) // strides + 1)
- # compute the output shape
- dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
- dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
- pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
- padding, (dilated_kernel_h, dilated_kernel_w)
- )
- out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down)
// stride_h + 1)
- out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right)
// stride_w + 1)
-
- pad_before = [0, pad_top, pad_left, 0]
- pad_after = [0, pad_down, pad_right, 0]
+ # Apply padding
+ pad_before = [0, pad_left, 0]
+ pad_after = [0, pad_right, 0]
padded_data = pad(data, pad_before, pad_after, name="padded_data")
+ # Compute graph
rc = te.reduce_axis((0, in_channels), name="rc")
- ry = te.reduce_axis((0, kernel_h), name="ry")
- rx = te.reduce_axis((0, kernel_w), name="rx")
+ rw = te.reduce_axis((0, kernel_size), name="rw")
conv = te.compute(
- (batch_size, out_height, out_width, out_channels),
- lambda nn, yy, xx, ff: te.sum(
- padded_data[
- nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx *
dilation_w, rc
- ].astype(out_dtype)
- * kernel[ry, rx, ff, rc].astype(out_dtype),
- axis=[ry, rx, rc],
+ (batch_size, out_width, out_channels),
+ lambda b, w, c: te.sum(
+ padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype)
+ * kernel[rw, c, rc].astype(out_dtype),
+ axis=[rw, rc],
),
- name="conv2d",
- tag="conv2d_nhwc",
+ name="conv1d",
+ tag="conv1d_nwc",
)
###########################
# Config Space Definition #
###########################
- n, oh, ow, co = (
+ n, ow, co = (
cfg.axis(batch_size.value),
- cfg.axis(out_height.value),
cfg.axis(out_width.value),
cfg.axis(out_channels.value),
)
- kh, kw, ci = (
- cfg.reduce_axis(kernel_h.value),
- cfg.reduce_axis(kernel_w.value),
+ kw, ci = (
+ cfg.reduce_axis(kernel_size.value),
cfg.reduce_axis(in_channels.value),
)
@@ -125,13 +111,13 @@ def conv2d_nhwc_direct_simd_compute(cfg, data, kernel,
strides, padding, dilatio
cfg.define_reorder(
"reorder_0_simd",
- [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
+ [n, owo, owi, coo, coi, kw, cio, cii],
policy="candidate",
candidate=[
- [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
- [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
- [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
- [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
+ [n, kw, owo, coo, cio, owi, coi, cii],
+ [n, kw, coo, owo, cio, owi, coi, cii],
+ [n, kw, owo, coo, cio, owi, coi, cii],
+ [n, kw, coo, owo, cio, owi, coi, cii],
],
)
@@ -146,24 +132,25 @@ def conv2d_nhwc_direct_simd_compute(cfg, data, kernel,
strides, padding, dilatio
return conv
-def conv2d_nhwc_direct_simd_schedule(cfg, outs):
- """Schedule function for Cortex-M7 SIMD implementation of conv2d."""
+def conv1d_nwc_dsp_schedule(cfg, outs):
+ """Schedule function for v7e-m DSP instructions of conv1d on NWC layout."""
sched = te.create_schedule([x.op for x in outs])
def _callback(op):
- if "conv2d_nhwc" not in op.tag:
+ if "conv1d_nwc" not in op.tag:
return
# extract tensors
output = op.output(0)
conv = op
data_vec = conv.input_tensors[0]
- kernel = conv.input_tensors[1] # pylint: disable=unused-variable
- last = outs[0] # pylint: disable=unused-variable
+
+ source_index_w = output.op.body[0].source[0].a.value.indices[1].a
+ stride_w = source_index_w.b.value if isinstance(source_index_w, Mul)
else 1
# tile reduction axes
- n, oh, ow, co = sched[conv].op.axis
- kh, kw, ci = sched[conv].op.reduce_axis
+ n, ow, co = sched[conv].op.axis
+ kw, ci = sched[conv].op.reduce_axis
M = cfg["tile_ow"].size[-1]
K = cfg["tile_ci"].size[-1]
@@ -173,9 +160,9 @@ def conv2d_nhwc_direct_simd_schedule(cfg, outs):
cio, cii = cfg["tile_ci"].apply(sched, conv, ci)
coo, coi = cfg["tile_co"].apply(sched, conv, co)
- cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi,
kh, kw, cio, cii])
+ cfg["reorder_0_simd"].apply(sched, conv, [n, owo, owi, coo, coi, kw,
cio, cii])
- gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype,
output.dtype)
+ gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype,
output.dtype, stride_w)
sched[output].tensorize(owi, gemm)
sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py
similarity index 88%
rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
rename to python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py
index 5ef9fd8..470d46b 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py
@@ -23,15 +23,16 @@ from tvm import te
from tvm.topi.utils import simplify, traverse_inline
from tvm.topi.nn.pad import pad
from tvm.topi.nn.utils import get_pad_tuple
+from tvm.tir.expr import Mul
-from ..micro_kernel.gemm import (
+from .micro_kernel.gemm import (
intrin_gemm_MxKxN,
gemm_MxKxN_impl,
)
-def conv2d_nhwc_direct_simd(*args, **kwargs):
- """Defines the Cortex-M7 SIMD implementation of conv2d."""
+def conv2d_nhwc_dsp(*args, **kwargs):
+ """Defines the v7e-m DSP instructions of conv2d."""
assert not kwargs, "Do not support kwargs in template function call"
args = deserialize_args(args)
data, kernel = args[:2]
@@ -39,18 +40,18 @@ def conv2d_nhwc_direct_simd(*args, **kwargs):
cfg = autotvm.get_config()
args = [cfg] + args
assert layout == "NHWC"
- conv = conv2d_nhwc_direct_simd_compute(*args)
- sched = conv2d_nhwc_direct_simd_schedule(cfg, [data, kernel, conv])
+ conv = conv2d_nhwc_dsp_compute(*args)
+ sched = conv2d_nhwc_dsp_schedule(cfg, [data, kernel, conv])
return sched, [data, kernel, conv]
-conv2d_nhwc_direct_simd.template_key = "direct_simd"
-conv2d_nhwc_direct_simd.default_data_layout = "NHWC"
-conv2d_nhwc_direct_simd.default_kernel_layout = "HWOI"
+conv2d_nhwc_dsp.template_key = "dsp"
+conv2d_nhwc_dsp.default_data_layout = "NHWC"
+conv2d_nhwc_dsp.default_kernel_layout = "HWOI"
-def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding,
dilation, out_dtype):
- """Compute function for Cortex-M7 SIMD implementation of conv2d."""
+def conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation,
out_dtype):
+ """Compute function for v7e-m DSP instructions of conv2d."""
assert isinstance(strides, int) or len(strides) == 2
assert isinstance(dilation, int) or len(dilation) == 2
@@ -146,8 +147,8 @@ def conv2d_nhwc_direct_simd_compute(cfg, data, kernel,
strides, padding, dilatio
return conv
-def conv2d_nhwc_direct_simd_schedule(cfg, outs):
- """Schedule function for Cortex-M7 SIMD implementation of conv2d."""
+def conv2d_nhwc_dsp_schedule(cfg, outs):
+ """Schedule function for v7e-m DSP instructions of conv2d."""
sched = te.create_schedule([x.op for x in outs])
def _callback(op):
@@ -161,6 +162,9 @@ def conv2d_nhwc_direct_simd_schedule(cfg, outs):
kernel = conv.input_tensors[1] # pylint: disable=unused-variable
last = outs[0] # pylint: disable=unused-variable
+ source_index_w = output.op.body[0].source[0].a.value.indices[2].a
+ stride_w = source_index_w.b.value if isinstance(source_index_w, Mul)
else 1
+
# tile reduction axes
n, oh, ow, co = sched[conv].op.axis
kh, kw, ci = sched[conv].op.reduce_axis
@@ -175,7 +179,7 @@ def conv2d_nhwc_direct_simd_schedule(cfg, outs):
cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi,
kh, kw, cio, cii])
- gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype,
output.dtype)
+ gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype,
output.dtype, stride_w)
sched[output].tensorize(owi, gemm)
sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py
new file mode 100644
index 0000000..20dfb09
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Direct implementation of dense."""
+
+from tvm import te
+from tvm.topi.utils import traverse_inline
+
+from .micro_kernel.gemm import (
+ intrin_gemm_MxKxN,
+ gemm_MxKxN_impl,
+)
+
+
def dense_dsp_schedule(outs):
    """Schedule function for v7e-m DSP instructions of dense."""
    sched = te.create_schedule([out_tensor.op for out_tensor in outs])

    def _callback(op):
        # Only ops tagged as dense get the micro-kernel treatment.
        if "dense" not in op.tag:
            return

        # Tensors involved: the dense output, its data input, and its weights.
        output = op.output(0)
        data = op.input_tensors[0]
        weight = op.input_tensors[1]
        M, K = data.shape
        N, _ = weight.shape

        # Split the row axis into a single outer iteration so the entire
        # (M, N) computation can be tensorized by the gemm micro-kernel.
        row_axis, _ = sched[op].op.axis
        outer, inner = sched[op].split(row_axis, nparts=1)

        gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data.dtype, output.dtype)
        sched[output].tensorize(inner, gemm)
        # Attach the generated C micro-kernel source via an import_c pragma.
        sched[output].pragma(outer, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id))

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py
similarity index 100%
rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py
rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py
new file mode 100644
index 0000000..786ac26
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Defines sum intrinsics for sum operation with v7e-m DSP instructions."""
+
+import random
+import string
+
+import tvm
+from tvm import te
+from . import common
+
+
def intrin_sum(shape, in_dtype, out_dtype, reset=False):
    """Defines a v7e-m DSP-accelerated sum operation."""
    # Random suffix keeps the generated C symbols unique when this intrinsic
    # is instantiated more than once in the same module.
    UNIQ_ID_LEN = 8
    uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
    func_prefix = "sum16"

    assert in_dtype == "int16"
    assert out_dtype == "int16"

    width = shape[-1]
    data = te.placeholder(shape, name="x", dtype=in_dtype)
    reduce_k = te.reduce_axis((0, width), name="rc")

    def _shift_last(indices, offset):
        # Offset only the innermost index by the reduction variable.
        shifted = list(indices)
        shifted[-1] = shifted[-1] + offset
        return tuple(shifted)

    result = te.compute(
        (1,) * len(shape),
        lambda *idx: te.sum(data[_shift_last(idx, reduce_k)], axis=[reduce_k]).astype(out_dtype),
    )

    def _intrin_func(ins, outs):
        in_buf = ins[0]
        out_buf = outs[0]

        def _invoke():
            builder = tvm.tir.ir_builder.create()
            builder.emit(
                tvm.tir.call_extern(
                    out_buf.dtype,
                    f"{func_prefix}_{width}_{uniq_id}",
                    in_buf.access_ptr("r"),
                    out_buf.access_ptr("w"),
                    in_buf.elem_offset,
                    1 if reset else 0,
                )
            )
            return builder.get()

        def _reduce_reset():
            builder = tvm.tir.ir_builder.create()
            builder.emit(
                tvm.tir.call_extern(
                    out_buf.dtype, f"{func_prefix}_reset_{uniq_id}", out_buf.access_ptr("w")
                )
            )
            return builder.get()

        # (body, reduce_reset, reduce_update) triple; update is the same call
        # as the body, which accumulates into the result buffer.
        return _invoke(), _reduce_reset(), _invoke()

    binds = {}
    for tensor in (data, result):
        binds[tensor] = tvm.tir.decl_buffer(
            tensor.shape,
            tensor.dtype,
            tensor.op.name,
            strides=[te.var(f"{tensor.op.name}_s_{i}") for i in range(len(tensor.shape))],
            offset_factor=1,
        )

    return te.decl_tensor_intrin(result.op, _intrin_func, binds=binds), uniq_id
+
+
def sum_impl(N: int, uniq_id: str) -> str:
    """Emit C code for sum impl.

    Generates ``sum16_{N}_{uniq_id}`` (accumulate N int16 elements into a
    single int16 result) and ``sum16_reset_{uniq_id}`` (zero the result).
    """
    # The kernel adds two int16 lanes per 32-bit word using __SMLAD with the
    # constant multiplier 0x00010001 (1 in each halfword).  When the buffer
    # starts at an odd halfword offset, the first element is added scalar so
    # the word loads below stay 4-byte aligned; an odd trailing element is
    # likewise handled scalar after the main loop.
    cc_code = (
        common.common_includes
        + f"""

#ifdef __cplusplus
extern "C"
#endif // __cplusplus
__STATIC_FORCEINLINE int32_t sum16_reset_{uniq_id}(
    int16_t *res) {{
  *res = (int16_t)0;
  return 0;
}}

#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t sum16_{N}_{uniq_id}(
    int16_t *arr,
    int16_t *res16,
    long arr_offset,
    int reset) {{
  int n;
  int32_t *p32;
  int32_t res = reset ? 0 : *res16;

  if ( arr_offset % 4 != 0 ) {{
    res += *arr;
    p32 = (int32_t *)(&arr[1]);
    n = {N} - 1;
  }} else {{
    p32 = (int32_t *)arr;
    n = {N};
  }}

  for ( int i = 0; i < n / 2; ++ i ) {{
    res = __SMLAD(*p32, 0x00010001, res);
    ++ p32;
  }}

  if ( n % 2 != 0 )
    res += *(int16_t *)p32;

  *res16 = res;

  return 0;
}}

"""
    )
    return cc_code
diff --git a/tests/scripts/task_python_integration_gpuonly.sh
b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
old mode 100755
new mode 100644
similarity index 72%
copy from tests/scripts/task_python_integration_gpuonly.sh
copy to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
index 36c3883..a37b297
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py
@@ -1,4 +1,3 @@
-#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -15,10 +14,19 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Defines common C code for all microkernel operations."""
-export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl
-device=mali,aocl_sw_emu"
-export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
-export TVM_RELAY_TEST_TARGETS="cuda"
-export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
-./tests/scripts/task_python_integration.sh
+common_includes = """
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <arm_math.h>
+#include <arm_nnsupportfunctions.h>
+
+#include <tvm/runtime/crt/error_codes.h>
+
+"""
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
similarity index 89%
rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
index 9a00fe2..ffc48ea 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py
@@ -15,21 +15,23 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, no-value-for-parameter
-"""Defines gemm intrinsics for SIMD matrix multiplication."""
+"""Defines gemm intrinsics for matrix multiplication with v7e-m DSP
instructions."""
import random
import string
import tvm
from tvm import te
+from . import common
+
##########################
# MxKxN MatMul Intrinsic #
##########################
# NOTE this is transposed matmul (A * B^T)
-def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
- """Defines a SIMD-accelerated transposed matmul."""
+def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype, stride_w=1):
+ """Defines a v7e-m DSP-accelerated transposed matmul."""
# we generate a unique ID for every intrinsic definition, to prevent name
# collisions in the generated source (e.g., if there are multiple operators
# in the same module that use the same intrinsic)
@@ -49,12 +51,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
# TODO(weberlo, areusch): support more dtypes?
assert in_dtype in ("int8", "int16")
assert out_dtype == "int32"
- A = te.placeholder((M, K), name="a", dtype=in_dtype)
+ A = te.placeholder((M * stride_w - (stride_w - 1), K), name="a",
dtype=in_dtype)
B = te.placeholder((N, K), name="b", dtype=in_dtype)
k = te.reduce_axis((0, K), name="k")
C = te.compute(
(M, N),
- lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j,
k].astype(out_dtype), axis=k),
+ lambda i, j: te.sum(
+ A[i * stride_w, k].astype(out_dtype) * B[j, k].astype(out_dtype),
axis=k
+ ),
name="c",
)
A_buf = tvm.tir.decl_buffer(
@@ -81,7 +85,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
- aa.strides[0],
+ aa.strides[0] * stride_w,
bb.strides[0],
cc.strides[0],
)
@@ -106,7 +110,7 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype):
aa.access_ptr("r"),
bb.access_ptr("r"),
cc.access_ptr("w"),
- aa.strides[0],
+ aa.strides[0] * stride_w,
bb.strides[0],
cc.strides[0],
)
@@ -125,12 +129,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id):
# aa_pad_size = M * K
bb_pad_size = N * K
# code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601)
- cc_code = f"""
-#ifdef __cplusplus
-extern "C"
-#endif
-#include <arm_math.h>
-#include <arm_nnsupportfunctions.h>
+ cc_code = (
+ common.common_includes
+ + f"""
+
#ifdef __cplusplus
extern "C"
@@ -203,9 +205,12 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_body_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t bb_pad[{bb_pad_size}];
+ int32_t retcode = 0;
- if ( {M} < 16 || {N} < 16 )
- return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ if ( {M} < 16 || {N} < 16 ) {{
+ retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ goto out;
+ }}
for (int i = 0; i < {N}; i++)
for (int j = 0; j < {K} / 4; j++)
@@ -234,10 +239,10 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_body_{uniq_id}(
if ( {K} % 4 != 0 )
gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
- return 0;
+out:
+ return retcode;
}}
-
#ifdef __cplusplus
extern "C"
#endif
@@ -306,9 +311,12 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_update_{uniq_id}(
int8_t *aa, int8_t *bb, int32_t *cc,
int A_stride, int B_stride, int C_stride) {{
int16_t bb_pad[{bb_pad_size}];
+ int32_t retcode = 0;
- if ( {M} < 16 || {N} < 16 )
- return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ if ( {M} < 16 || {N} < 16 ) {{
+ retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ goto out;
+ }}
for (int i = 0; i < {N}; i++)
for (int j = 0; j < {K} / 4; j++)
@@ -334,11 +342,10 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_update_{uniq_id}(
if ( {K} % 4 != 0 )
gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
- return 0;
+out:
+ return retcode;
}}
-
-
#ifdef __cplusplus
extern "C"
#endif
@@ -383,15 +390,24 @@ extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}(
int16_t *aa, int16_t *bb, int32_t *cc,
- int A_stride, int B_stride, int C_stride) {{
- if ( {M} < 2 || {N} < 2 )
- return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ int A_stride, int B_stride, int C_stride) {{
+ int32_t retcode = 0;
+
+ if ( {M} < 2 || {N} < 2 ) {{
+ retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ goto out;
+ }}
+
+ if(((uint32_t)aa & 0x3) != 0 || ((uint32_t)bb & 0x3) != 0){{
+ retcode = kTvmErrorFunctionCallInvalidArg;
+ goto out;
+ }}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
int32_t *aa_ptr = (int32_t *) &aa[i*A_stride];
int32_t *bb_ptr = (int32_t *) &bb[j*B_stride];
-
+
int32_t sum = 0;
for (int l = 0; l < {K} / 2; l++) {{
sum = __SMLAD(*aa_ptr, *bb_ptr, sum);
@@ -407,10 +423,10 @@ __STATIC_FORCEINLINE int32_t
gemm16_{M}x{K}x{N}_body_{uniq_id}(
if ( {K} % 2 != 0 )
gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
- return 0;
+out:
+ return retcode;
}}
-
#ifdef __cplusplus
extern "C"
#endif
@@ -452,9 +468,13 @@ extern "C"
#endif
__STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}(
int16_t *aa, int16_t *bb, int32_t *cc,
- int A_stride, int B_stride, int C_stride) {{
- if ( {M} < 2 || {N} < 2 )
- return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ int A_stride, int B_stride, int C_stride) {{
+ int32_t retcode = 0;
+
+ if ( {M} < 2 || {N} < 2 ) {{
+ retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride,
B_stride, C_stride);
+ goto out;
+ }}
for (int i = 0; i < {M}; i++) {{
for (int j = 0; j < {N}; j++) {{
@@ -473,11 +493,10 @@ __STATIC_FORCEINLINE int32_t
gemm16_{M}x{K}x{N}_update_{uniq_id}(
if ( {K} % 2 != 0 )
gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride,
C_stride);
- return 0;
+out:
+ return retcode;
}}
-
-
#ifdef __cplusplus
extern "C"
#endif
@@ -489,5 +508,7 @@ __STATIC_FORCEINLINE int32_t
gemm_{M}x{K}x{N}_reset_{uniq_id}(int32_t *cc, int C
}}
return 0;
}}
- """
+
+"""
+ )
return cc_code
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
new file mode 100644
index 0000000..4d41042
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Defines max intrinsics for elemwise max operation with v7e-m DSP
instructions."""
+
+import random
+import string
+
+import tvm
+from tvm import te
+from . import common
+
+
def intrin_max(shape, in_dtype, out_dtype):
    """Defines a v7e-m DSP-accelerated max pool."""
    # Random suffix keeps the generated C symbols unique across
    # instantiations of this intrinsic in the same module.
    UNIQ_ID_LEN = 8
    uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN))
    func_prefix = "max8"

    assert in_dtype == "int8"
    assert out_dtype == "int8"

    data = te.placeholder(shape, name="x", dtype=in_dtype)
    window = te.reduce_axis((0, 1), name="rc")
    result = te.compute(
        shape, lambda *idx: tvm.tir.max(data[idx], axis=[window]).astype(out_dtype)
    )

    def _intrin_func(ins, outs):
        in_buf = ins[0]
        out_buf = outs[0]

        def _invoke():
            builder = tvm.tir.ir_builder.create()
            builder.emit(
                tvm.tir.call_extern(
                    out_buf.dtype,
                    f"{func_prefix}_{uniq_id}",
                    in_buf.access_ptr("r"),
                    out_buf.access_ptr("w"),
                    out_buf.strides[0],
                )
            )
            return builder.get()

        def _reduce_reset():
            builder = tvm.tir.ir_builder.create()
            builder.emit(
                tvm.tir.call_extern(
                    out_buf.dtype,
                    f"{func_prefix}_reset_{uniq_id}",
                    out_buf.access_ptr("w"),
                    out_buf.strides[0],
                )
            )
            return builder.get()

        # (body, reduce_reset, reduce_update) triple expected by
        # decl_tensor_intrin; update is the same element-wise max as body.
        return _invoke(), _reduce_reset(), _invoke()

    binds = {}
    for tensor in (data, result):
        binds[tensor] = tvm.tir.decl_buffer(
            tensor.shape,
            tensor.dtype,
            tensor.op.name,
            strides=[te.var(f"{tensor.op.name}_s_{i}") for i in range(len(tensor.shape))],
            offset_factor=1,
        )

    return te.decl_tensor_intrin(result.op, _intrin_func, binds=binds), uniq_id
+
+
def max_impl(uniq_id: str) -> str:
    """Emit C code for pool impl.

    Generates ``max8_{uniq_id}`` (element-wise in-place max of ``arg`` into
    ``res``), a scalar fallback ``max8_loop_{uniq_id}``, and
    ``max8_reset_{uniq_id}`` (fill the result with INT8_MIN, -128).
    """
    # The vector path compares 4 int8 lanes per word: __SSUB8 sets the
    # per-lane GE flags which __SEL then uses to pick the larger lane
    # (CMSIS DSP intrinsics).  The scalar loop handles small N, heads that
    # restore 4-byte alignment, tails, and unfixable misalignment (arg and
    # res misaligned by different amounts).
    cc_code = (
        common.common_includes
        + f"""


#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t max8_reset_{uniq_id}(
    int8_t *res,
    int N) {{
  memset(res, (int8_t)-128, N * sizeof(*res));
  return 0;
}}

#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t max8_loop_{uniq_id}(
    int8_t *arg,
    int8_t *res,
    int N) {{
  for ( int i = 0; i < N; ++ i )
    if ( arg[i] > res[i] )
      res[i] = arg[i];
  return 0;
}}

#ifdef __cplusplus
extern "C"
#endif
__STATIC_FORCEINLINE int32_t max8_{uniq_id}(
    int8_t *arg,
    int8_t *res,
    int N) {{
  int32_t *parg32, *pres32;
  int una_arg = (int32_t)arg & 0x3, una_res = (int32_t)res & 0x3;
  int32_t retcode = 0;

  if ( N < 4 || ((una_arg || una_res) && una_arg != una_res) ) {{
    retcode = max8_loop_{uniq_id}(arg, res, N);
    goto out;
  }}
  if ( una_arg ) {{
    int n = (4 - una_arg);
    if ( n > N || (N - n) < 4 )
      n = N;
    retcode = max8_loop_{uniq_id}(arg, res, n);
    N -= n;
    if ( N == 0 )
      goto out;
    arg += n; res += n;
  }}

  parg32 = (int32_t *)arg;
  pres32 = (int32_t *)res;

  for ( int i = 0; i < N / 4; ++ i ) {{
    int32_t arg32 = *parg32 ++;
    int32_t res32 = *pres32;
    __SSUB8(arg32, res32);
    res32 = __SEL(arg32, res32);
    *pres32 ++ = res32;
  }}

  if ( N & 0x3 ) {{
    retcode = max8_loop_{uniq_id}((int8_t *)parg32, (int8_t *)pres32, N & 0x3);
    goto out;
  }}

out:
  return retcode;
}}

"""
    )
    return cc_code
diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
new file mode 100644
index 0000000..99470a2
--- /dev/null
+++ b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py
@@ -0,0 +1,125 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, no-value-for-parameter
+"""Direct implementation of pool."""
+import logging
+
+import tvm
+
+from tvm import te
+from tvm.topi.utils import traverse_inline
+
+from .micro_kernel.max_pool import (
+ intrin_max,
+ max_impl,
+)
+
+from .micro_kernel.avg_pool import (
+ intrin_sum,
+ sum_impl,
+)
+
+logger = logging.getLogger("topi")
+
+
def schedule_maxpool_1d_nwc(s, op):
    """Schedule function for v7e-m DSP instructions of maxpool 1d NWC layout."""
    out_tensor = op.output(0)
    in_tensor = op.input_tensors[0]

    # intrin_max needs the channel count as a plain Python int.
    num_channels = in_tensor.shape[-1]
    if isinstance(num_channels, tvm.tir.IntImm):
        num_channels = num_channels.value

    batch, width, chan = s[op].op.axis
    (red,) = s[op].op.reduce_axis

    # Move the window reduction outside the channel loop, then replace the
    # whole channel extent with the element-wise max micro-kernel.
    s[op].reorder(batch, width, red, chan)
    intrin, uniq_id = intrin_max((1, 1, num_channels), in_tensor.dtype, out_tensor.dtype)
    s[op].tensorize(chan, intrin)
    s[out_tensor].pragma(batch, "import_c", max_impl(uniq_id))
+
+
def schedule_maxpool_2d_nhwc(s, op):
    """Schedule function for v7e-m DSP instructions of maxpool 2d NHWC layout."""
    out_tensor = op.output(0)
    in_tensor = op.input_tensors[0]

    # intrin_max needs the channel count as a plain Python int.
    num_channels = in_tensor.shape[-1]
    if isinstance(num_channels, tvm.tir.IntImm):
        num_channels = num_channels.value

    batch, height, width, chan = s[op].op.axis
    red_outer, red_inner = s[op].op.reduce_axis

    # Push both pooling-window loops outside the channel loop so the entire
    # channel extent can be tensorized by the element-wise max micro-kernel.
    s[op].reorder(batch, height, width, red_outer, red_inner, chan)
    intrin, uniq_id = intrin_max((1, 1, 1, num_channels), in_tensor.dtype, out_tensor.dtype)
    s[op].tensorize(chan, intrin)
    s[out_tensor].pragma(batch, "import_c", max_impl(uniq_id))
+
+
def schedule_avgpool_1d_ncw(s, op):
    """Schedule function for v7e-m DSP instructions of avgpool 1d NCW layout."""
    out_tensor = op.output(0)
    in_tensor = op.input_tensors[0]

    batch, _, _ = s[op].op.axis
    (red,) = s[op].op.reduce_axis
    window_w = red.dom.extent.value

    # The single reduce axis covers the whole pooling window, so the
    # intrinsic may clear the accumulator itself (reset=True).
    intrin, uniq_id = intrin_sum((1, 1, window_w), in_tensor.dtype, out_tensor.dtype, reset=True)
    s[op].tensorize(red, intrin)
    s[out_tensor].pragma(batch, "import_c", sum_impl(window_w, uniq_id))
+
+
def schedule_avgpool_2d_nchw(s, op):
    """Schedule function for v7e-m DSP instructions of avgpool 2d NCHW layout."""
    out_tensor = op.output(0)
    in_tensor = op.input_tensors[0]

    batch, _, _, _ = s[op].op.axis
    _, red_inner = s[op].op.reduce_axis
    window_w = red_inner.dom.extent.value

    # Only the inner (width) reduction is tensorized; the outer reduction
    # keeps accumulating across rows, so the intrinsic must not reset the
    # result (default reset=False).
    intrin, uniq_id = intrin_sum((1, 1, 1, window_w), in_tensor.dtype, out_tensor.dtype)
    s[op].tensorize(red_inner, intrin)
    s[out_tensor].pragma(batch, "import_c", sum_impl(window_w, uniq_id))
+
+
def pool_dsp_schedule(outs, layout):
    """Schedule function for v7e-m DSP instructions of pooling."""
    sched = te.create_schedule([out.op for out in outs])

    # Layout -> schedule dispatch tables for each pooling flavour.
    max_schedules = {"NWC": schedule_maxpool_1d_nwc, "NHWC": schedule_maxpool_2d_nhwc}
    sum_schedules = {"NCW": schedule_avgpool_1d_ncw, "NCHW": schedule_avgpool_2d_nchw}

    def _callback(op):
        in_dtype = op.input_tensors[0].dtype
        if "pool_max" in op.tag:
            # Max-pool micro-kernel only exists for int8 data.
            if in_dtype != "int8":
                logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype)
            elif layout in max_schedules:
                max_schedules[layout](sched, op)
        elif "pool_sum" in op.tag:
            # Avg-pool (sum) micro-kernel only exists for int16 data.
            if in_dtype != "int16":
                logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype)
            elif layout in sum_schedules:
                sum_schedules[layout](sched, op)

    traverse_inline(sched, outs[-1].op, _callback)
    return sched
diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py
b/python/tvm/topi/arm_cpu/pooling.py
similarity index 74%
rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py
rename to python/tvm/topi/arm_cpu/pooling.py
index cc4faf9..f09f008 100644
--- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py
+++ b/python/tvm/topi/arm_cpu/pooling.py
@@ -14,6 +14,12 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""Conv2d implementations for cortex-m7."""
+# pylint: disable=invalid-name, unused-variable
+"""Schedule for pooling operators"""
-from . import direct_simd
+from .mprofile.dsp.pool import pool_dsp_schedule
+
+
def schedule_pool(outs, layout):
    """Create schedule for avgpool/maxpool with dsp"""
    # Thin wrapper: all DSP pooling scheduling lives in mprofile.dsp.pool.
    sched = pool_dsp_schedule(outs, layout)
    return sched
diff --git a/tests/python/conftest.py b/tests/python/conftest.py
index ab3ea4e..0dbb3dc 100644
--- a/tests/python/conftest.py
+++ b/tests/python/conftest.py
@@ -17,6 +17,7 @@
import sys
import tvm
+import pytest
collect_ignore = []
if sys.platform.startswith("win"):
@@ -37,3 +38,23 @@ if sys.platform.startswith("win"):
# collect_ignore.append("unittest/test_auto_scheduler_measure.py") #
exception ignored
collect_ignore.append("unittest/test_tir_intrin.py")
+
+
def pytest_addoption(parser):
    """Register the command-line flag that gates Corstone-300 FVP tests."""
    corstone_opts = dict(
        action="store_true",
        default=False,
        help="Run Corstone-300 FVP tests",
    )
    parser.addoption("--enable-corstone300-tests", **corstone_opts)
+
+
def pytest_collection_modifyitems(config, items):
    """Skip corstone300-marked tests unless the gating flag was passed."""
    if config.getoption("--enable-corstone300-tests"):
        return
    # One shared skip marker for every gated test.
    skip_corstone300 = pytest.mark.skip(
        reason="Need --enable-corstone300-tests option to run this test"
    )
    for item in items:
        if "corstone300" in item.keywords:
            item.add_marker(skip_corstone300)
diff --git a/tests/python/integration/test_arm_mprofile_dsp.py
b/tests/python/integration/test_arm_mprofile_dsp.py
new file mode 100644
index 0000000..cdafa91
--- /dev/null
+++ b/tests/python/integration/test_arm_mprofile_dsp.py
@@ -0,0 +1,355 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import sys
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tests.python.relay.aot.aot_test_utils import (
+ AOTTestModel,
+ AOT_CORSTONE300_RUNNER,
+ generate_ref_data,
+ compile_and_run,
+)
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation",
    [
        ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1),
        ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1),
        ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1),
        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1),
        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
        ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1),
        ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2),
        ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2),
        # bug https://github.com/apache/tvm/issues/9226
        ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1),
        # from Visual Wake Word model
        ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1),
        # from Image Classification model (one of the MLPerfTiny models)
        ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1),
        ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1),
        ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1),
        ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1),
        ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1),
        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1),
        ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1),
        ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1),
        ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1),
    ],
)
@pytest.mark.parametrize("dtype", ["int8", "int16"])
def test_conv2d(data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation, dtype):
    """Test a subgraph with a single conv2d operator."""
    ishape = data_shape_nhwc
    # Weights are generated in HWIO layout: (H, W, in_channels, out_channels).
    wshape = (*kernel_size, data_shape_nhwc[-1], num_filter)

    weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)

    # Reference module: NHWC/HWIO conv2d, evaluated on the host to produce
    # the expected outputs.
    input0 = relay.var("input", relay.TensorType(ishape, dtype))
    weight0 = relay.const(weight_data)
    out0 = relay.op.nn.conv2d(
        input0,
        weight0,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        dilation=(dilation, dilation),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="int32",
        out_layout="NHWC",
    )
    ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))

    # Module under test: same conv2d with the kernel axes moved to HWOI
    # (np.moveaxis swaps the I and O axes), the layout used on-device.
    input1 = relay.var("input", relay.TensorType(ishape, dtype))
    weight1 = relay.const(np.moveaxis(weight_data, 2, -1))
    out1 = relay.op.nn.conv2d(
        input1,
        weight1,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        dilation=(dilation, dilation),
        data_layout="NHWC",
        kernel_layout="HWOI",
        out_dtype="int32",
        out_layout="NHWC",
    )
    mod = tvm.IRModule.from_expr(relay.Function([input1], out1))

    inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
    output_list = generate_ref_data(ref_mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner, comparing
    # against the host-computed reference.
    compile_and_run(
        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_nwc, kernel_size, num_filter, strides, padding",
    [
        ((1, 32, 12), 3, 16, 1, 0),
        ((3, 12, 10), 4, 24, 1, 0),
        ((1, 7, 7), 3, 5, 1, 0),
        ((1, 10, 2), 4, 4, 2, (1, 1)),
        ((1, 20, 2), 4, 4, 2, (0, 1)),
        ((1, 16, 4), 1, 12, 1, (1, 0)),
        ((1, 24, 16), 1, 32, 3, (2, 2)),
    ],
)
@pytest.mark.parametrize("dtype", ["int8", "int16"])
def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype):
    """Test a subgraph with a single conv1d operator."""
    ishape = data_shape_nwc
    # Weights are generated in WIO layout: (W, in_channels, out_channels).
    wshape = (kernel_size, data_shape_nwc[-1], num_filter)

    weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype)

    # Reference module: NWC/WIO conv1d, evaluated on the host to produce the
    # expected outputs.
    input0 = relay.var("input", relay.TensorType(ishape, dtype))
    weight0 = relay.const(weight_data)
    out0 = relay.op.nn.conv1d(
        input0,
        weight0,
        strides=strides,
        padding=padding,
        data_layout="NWC",
        kernel_layout="WIO",
        out_dtype="int32",
        out_layout="NWC",
    )
    ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))

    # Module under test: same conv1d with the kernel axes moved to WOI
    # (np.moveaxis swaps the I and O axes), the layout used on-device.
    input1 = relay.var("input", relay.TensorType(ishape, dtype))
    weight1 = relay.const(np.moveaxis(weight_data, 1, -1))
    out1 = relay.op.nn.conv1d(
        input1,
        weight1,
        strides=strides,
        padding=padding,
        data_layout="NWC",
        kernel_layout="WOI",
        out_dtype="int32",
        out_layout="NWC",
    )
    mod = tvm.IRModule.from_expr(relay.Function([input1], out1))

    inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)}
    output_list = generate_ref_data(ref_mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner.
    compile_and_run(
        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "M, K, N",
    [
        (1, 32, 64),
        (3, 12, 10),
    ],
)
def test_dense(M, K, N):
    """Test a subgraph with a single dense operator."""
    data_shape = (M, K)
    weight_shape = (N, K)

    # Build a batch_flatten -> dense graph with constant int8 weights.
    inp = relay.var("input", relay.TensorType(data_shape, "int8"))
    flattened = relay.op.nn.batch_flatten(inp)
    weights = relay.const(np.random.randint(low=-10, high=10, size=weight_shape, dtype="int8"))
    dense_out = relay.op.nn.dense(flattened, weights, out_dtype="int32")
    mod = tvm.IRModule.from_expr(relay.Function([inp], dense_out))

    # The reference output is computed on the host from the same module.
    inputs = {"input": np.random.randint(low=-128, high=127, size=data_shape, dtype="int8")}
    output_list = generate_ref_data(mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner.
    compile_and_run(
        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_nhwc, pool_size, strides, padding",
    [
        ((1, 32, 32, 1), (3, 3), 1, 0),
        ((1, 32, 20, 4), (3, 3), (2, 2), 0),
    ],
)
def test_maxpool_2d(data_shape_nhwc, pool_size, strides, padding):
    """Test a subgraph with a single maxpool_2d operator."""
    shape = data_shape_nhwc

    # Single NHWC max_pool2d over an int8 input.
    inp = relay.var("input", relay.TensorType(shape, "int8"))
    pooled = relay.op.nn.max_pool2d(
        inp, pool_size, layout="NHWC", strides=strides, padding=padding
    )
    mod = tvm.IRModule.from_expr(relay.Function([inp], pooled))

    # Host-evaluated reference output from the same module.
    inputs = {"input": np.random.randint(low=-128, high=127, size=shape, dtype="int8")}
    output_list = generate_ref_data(mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner.
    compile_and_run(
        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_nwc, pool_size, strides, padding",
    [
        ((1, 32, 1), 3, 1, 0),
        ((1, 20, 4), 3, 2, 0),
    ],
)
def test_maxpool_1d(data_shape_nwc, pool_size, strides, padding):
    """Test a subgraph with a single maxpool_1d operator."""
    shape = data_shape_nwc

    # Single NWC max_pool1d over an int8 input.
    inp = relay.var("input", relay.TensorType(shape, "int8"))
    pooled = relay.op.nn.max_pool1d(
        inp, pool_size, layout="NWC", strides=strides, padding=padding
    )
    mod = tvm.IRModule.from_expr(relay.Function([inp], pooled))

    # Host-evaluated reference output from the same module.
    inputs = {"input": np.random.randint(low=-128, high=127, size=shape, dtype="int8")}
    output_list = generate_ref_data(mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner.
    compile_and_run(
        AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_nchw, pool_size, strides, padding",
    [
        ((1, 1, 32, 32), (3, 3), 1, 0),
        ((1, 4, 32, 20), (3, 3), (2, 2), 0),
    ],
)
def test_avgpool_2d(data_shape_nchw, pool_size, strides, padding):
    """Test a subgraph with a single avgpool_2d operator.

    The reference module runs on the host with an int32 input, while the
    module under test takes the same data cast to int16 — the dtype the DSP
    avg-pool schedule handles.
    """
    ishape = data_shape_nchw

    # Reference: int32 NCHW avg_pool2d evaluated on the host.
    # NOTE: was `relay.nn.avg_pool2d` here but `relay.op.nn.avg_pool2d`
    # everywhere else in this file; normalized to the `relay.op.nn` path for
    # consistency (both resolve to the same operator).
    input0 = relay.var("input", relay.TensorType(ishape, "int32"))
    out0 = relay.op.nn.avg_pool2d(
        input0, pool_size, layout="NCHW", strides=strides, padding=padding
    )
    ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0))

    # Under test: identical graph with an int16 input.
    input1 = relay.var("input", relay.TensorType(ishape, "int16"))
    out1 = relay.op.nn.avg_pool2d(
        input1, pool_size, layout="NCHW", strides=strides, padding=padding
    )
    mod = tvm.IRModule.from_expr(relay.Function([input1], out1))

    input_data = np.random.randint(low=-128, high=127, size=ishape, dtype="int32")
    inputs = {"input": input_data}
    output_list = generate_ref_data(ref_mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner; the
    # device side consumes the int16-cast copy of the reference input.
    compile_and_run(
        AOTTestModel(
            module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list
        ),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
@tvm.testing.requires_corstone300
@pytest.mark.parametrize(
    "data_shape_ncw, pool_size, strides, padding",
    [
        ((1, 1, 32), 3, 1, 0),
        ((1, 4, 20), 3, 2, 2),
    ],
)
def test_avgpool_1d(data_shape_ncw, pool_size, strides, padding):
    """Test a subgraph with a single avgpool_1d operator."""
    shape = data_shape_ncw

    # Reference: int32 NCW avg_pool1d evaluated on the host.
    ref_inp = relay.var("input", relay.TensorType(shape, "int32"))
    ref_out = relay.op.nn.avg_pool1d(
        ref_inp, pool_size, layout="NCW", strides=strides, padding=padding
    )
    ref_mod = tvm.IRModule.from_expr(relay.Function([ref_inp], ref_out))

    # Under test: identical graph with an int16 input.
    dut_inp = relay.var("input", relay.TensorType(shape, "int16"))
    dut_out = relay.op.nn.avg_pool1d(
        dut_inp, pool_size, layout="NCW", strides=strides, padding=padding
    )
    mod = tvm.IRModule.from_expr(relay.Function([dut_inp], dut_out))

    input_data = np.random.randint(low=-10, high=10, size=shape, dtype="int32")
    inputs = {"input": input_data}
    output_list = generate_ref_data(ref_mod, inputs)

    # Compile for Cortex-M7 and run on the Corstone-300 FVP runner; the
    # device side consumes the int16-cast copy of the reference input.
    compile_and_run(
        AOTTestModel(
            module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list
        ),
        runner=AOT_CORSTONE300_RUNNER,
        interface_api="c",
        use_unpacked_api=True,
        target_opts={
            "-keys": "arm_cpu",
            "-mcpu": "cortex-m7",
        },
    )
+
+
if __name__ == "__main__":
    # Forward any extra command-line arguments straight to pytest.
    pytest_args = [__file__] + sys.argv[1:]
    sys.exit(pytest.main(pytest_args))
diff --git a/tests/python/relay/aot/aot_test_utils.py
b/tests/python/relay/aot/aot_test_utils.py
index 7d8a4f0..c73af19 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -545,6 +545,15 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage):
It is used to capture the tensor data (for both inputs and expected
outputs) to be bundled into the standalone application.
"""
file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve()
+ np_type_to_c = {
+ "int8": "int8_t",
+ "uint8": "uint8_t",
+ "int16": "int16_t",
+ "uint16": "uint16_t",
+ "int32": "int32_t",
+ "uint32": "uint32_t",
+ "float32": "float",
+ }
# create header file
raw_path = file_path.with_suffix(".h").resolve()
with open(raw_path, "w") as header_file:
@@ -555,14 +564,7 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage):
emit_data_linkage(header_file, data_linkage)
- if npy_data.dtype == "int8":
- header_file.write(f"int8_t {tensor_name}[] =")
- elif npy_data.dtype == "int32":
- header_file.write(f"int32_t {tensor_name}[] = ")
- elif npy_data.dtype == "uint8":
- header_file.write(f"uint8_t {tensor_name}[] = ")
- elif npy_data.dtype == "float32":
- header_file.write(f"float {tensor_name}[] = ")
+        header_file.write(f"{np_type_to_c[str(npy_data.dtype)]} {tensor_name}[] =")
header_file.write("{")
for i in np.ndindex(npy_data.shape):
@@ -577,6 +579,7 @@ def compile_models(
workspace_byte_alignment: int = 8,
enable_op_fusion: bool = True,
pass_config: Dict[str, Any] = None,
+ target_opts: Dict = None,
) -> List[AOTCompiledTestModel]:
"""
This method generates runtime.Modules for the tests
@@ -586,6 +589,9 @@ def compile_models(
base_target = "c -runtime=c --link-params --executor=aot"
    extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
+ if target_opts:
+ for key, val in target_opts.items():
+ extra_target += f" {key}={val}"
target = f"{base_target} {extra_target}"
config = {"tir.disable_vectorize": True}
@@ -727,6 +733,7 @@ def compile_and_run(
workspace_byte_alignment: int = 8,
enable_op_fusion: bool = True,
data_linkage: AOTDataLinkage = None,
+ target_opts: Dict = None,
):
"""This is a wrapper API to compile and run models as test for AoT"""
compiled_test_mods = compile_models(
@@ -736,6 +743,7 @@ def compile_and_run(
workspace_byte_alignment=workspace_byte_alignment,
enable_op_fusion=enable_op_fusion,
pass_config=runner.pass_config,
+ target_opts=target_opts,
)
run_and_check(
models=compiled_test_mods,
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 8618619..615caa5 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -74,3 +74,8 @@ run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver
# Do not enable OpenGL
# run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl
+
+
+if [ -z "${TVM_INTEGRATION_GPU_ONLY:-}" ] && [ -z "${TVM_INTEGRATION_I386_ONLY:-}" ] ; then
+    run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-m7-simd tests/python/integration/test_arm_mprofile_dsp.py --enable-corstone300-tests
+fi
diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh
index 36c3883..cb6bec4 100755
--- a/tests/scripts/task_python_integration_gpuonly.sh
+++ b/tests/scripts/task_python_integration_gpuonly.sh
@@ -20,5 +20,6 @@ export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_s
export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
export TVM_RELAY_TEST_TARGETS="cuda"
export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
+export TVM_INTEGRATION_GPU_ONLY=1
./tests/scripts/task_python_integration.sh