areusch commented on code in PR #13242:
URL: https://github.com/apache/tvm/pull/13242#discussion_r1026548519


##########
python/tvm/relay/op/nn/_nn.py:
##########
@@ -877,6 +877,21 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, 
desired_layouts):
     return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)
 
 
+# QNN ops
[email protected]_alter_op_layout("add")
+def alter_op_layout_add(attrs, inputs, tinfos, out_type):
+    """Alter the layout of a add op. Useful for fusing the bias constant with 
an input zero point

Review Comment:
   Please follow numpydoc style and start the docstring with a one-line summary. See
https://peps.python.org/pep-0257/#multi-line-docstrings



##########
python/tvm/relay/qnn/strategy/arm_cpu.py:
##########
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Quantized operator strategy for Arm CPU. These schedules are only used if 
the qnn.Legalize pass

Review Comment:
   Per PEP 257, start the module docstring with a one-line summary, then a blank line before the details.



##########
python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/tensordot.py:
##########
@@ -15,141 +15,294 @@
 # specific language governing permissions and limitations
 # under the License.
 """Computes a "jumpy tensordot" operator, which can be used to tensorize many 
common operators
-including regular conv2d, depthwise conv2d, and grouped conv2d provided the 
data and kernel layouts
-are the optimal ones. When groups=1, the optimal data layout is NHWC and 
kernel layout is OHWI. When
-this is a depthwise convolution, the optimal data layout is NCHW and kernel 
layout is OIHW."""
+including regular conv2d, depthwise conv2d, and grouped conv2d for some data 
and kernel layouts.
+When for regular convolution, use data laout HHWC and kernel layout OHWI. For 
depthwise convolution,
+use data layout data layout is NCHW and kernel layout OIHW."""
 
+from itertools import chain
 import textwrap
+from typing import Iterator, Tuple
 
-from tvm import te, tir
 
-from .common import num_simd_lanes_per_word
+def _get_c_function_name(split_size, dimensions, offsets, x_strides):
+    """Gets the C function name of the tensordot function. We do not need a 
suffix, as the generated
+    function will have an #include guard. Unlike other microTVM operators, 
_get_c_function_name is
+    never called externally."""
+    tensor_w, kernel_h, kernel_w = dimensions
+    return (
+        f"tensordot_opt_x{split_size}_int16_w{tensor_w}_"
+        + f"{kernel_h}x{kernel_w}_"
+        + "".join(map(str, offsets))
+        + (f"_{x_strides[0]}_{x_strides[1]}" if split_size > 1 else "")
+    )
 
 
-def _get_func_name(in_dtype, tensor_h, jump, tensor_w, suffix):
-    """Gets the C function name of the tensordot function."""
-    return f"tensordot_{in_dtype}_h{tensor_h}_j{jump}_w{tensor_w}_{suffix}"
+def _init_biased_accumulators(split_size):
+    """Addition is commutative, so we could add the bias before, during, or 
after performing our
+    multiply-accumulate operations. It "costs" one cycle either way - if done 
at the beginning we
+    can't use a SMULXY trick to set sum_i to zero for "free", and if done at 
the end it doesn't
+    combine with anything. However, doing it at the beginning frees up a 
register/prevents needing

Review Comment:
   What about overflow? If the bias is added to the accumulator up front, can the running sum overflow the accumulator width in cases where adding it at the end would not?



##########
python/tvm/relay/qnn/strategy/arm_cpu.py:
##########
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Quantized operator strategy for Arm CPU. These schedules are only used if 
the qnn.Legalize pass
+is disabled. These schedules only work on fused operators with a bias, as this 
is a very common use
+case. Currently only regular/depthwise conv2d is supported, but qnn_dense 
should be added."""
+
+from tvm import topi
+from .generic import qnn_conv2d_strategy
+from ... import op as _op
+from ...op.strategy.generic import is_depthwise_conv2d
+
+
+@qnn_conv2d_strategy.register("arm_cpu")
+def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
+    """qnn.conv2d strategy for Arm CPU. Currently, the schedules only support 
Cortex-M processors
+    with DSP - the qnn.Legalize pass should be run on all others."""
+
+    if not (target.features.has_dsp and "cortex-m" in target.mcpu):
+        raise RuntimeError(
+            "Quantized Arm schedules only exist for Cortex-M with DSP! "
+            "The qnn.Legalize pass should be run for other Arm processors."
+        )
+
+    data = inputs[0]
+    kernel = inputs[1]
+    data_layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    groups = attrs.groups
+    strategy = _op.OpStrategy()
+
+    if groups == 1:
+        if data_layout == "NHWC" and kernel_layout == "OHWI":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_conv2d,
+                topi.arm_cpu.schedule_qnn_conv2d,
+                name="qnn_conv2d.arm_cpu",
+            )
+    elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, 
kernel_layout, groups):
+        if data_layout == "NCHW" and kernel_layout == "IOHW":
+            strategy.add_implementation(
+                topi.arm_cpu.qnn_depthwise_conv2d,
+                topi.arm_cpu.schedule_qnn_depthwise_conv2d,
+                name="qnn_depthwise_conv2d.arm_cpu",
+            )
+    else:
+        raise RuntimeError("No Arm Cortex-M DSP strategy exists for generic 
group qnn.conv2d")

Review Comment:
   Can we assert or raise a TVMError here (and in the layout checks above) instead of RuntimeError, so failures surface through TVM's standard error handling?



##########
tests/python/relay/strategy/arm_cpu/test_quantized_convolution.py:
##########
@@ -0,0 +1,329 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""microTVM cares a lot about the convolution + bias + requantize + fused ReLU 
use case. There have
+been some accuracy issues in the past, so this test steps through a model 
(MobileNetV1) layer by
+layer and ensures there is 1-1 correspondance at each step. This test would 
run way faster if we ran
+the model all at once, but then we wouldn't know which layers had issues.
+
+Furthermore, this test uses some in-development optimizations for microTVM 
that aren't part of the
+main pipeline.
+"""
+
+import numpy as np
+from PIL import Image
+import pytest
+import tensorflow as tf
+
+import tvm
+import tvm.testing
+from tvm import meta_schedule, relay
+from tvm.testing.aot import AOTTestModel, run_and_check, AOTCompiledTestModel
+from tvm.relay.backend import Executor, Runtime
+from tvm.micro.testing.aot_test_utils import AOT_CORSTONE300_RUNNER
+from tvm.contrib.download import download_testdata
+from test_generalized_conv2d import change_ndarray_layout
+
+
+MODEL_URL = 
"https://github.com/mlcommons/tiny/raw/master/benchmark/training/visual_wake_words/trained_models/vww_96_int8.tflite";

Review Comment:
   Should this URL be version-pinned (e.g. to a specific commit hash) so the test doesn't break or silently change if the upstream model file is updated?



##########
src/relay/qnn/op/requantize.cc:
##########
@@ -498,8 +498,8 @@ bool RequantizeRel(const Array<Type>& types, int 
num_inputs, const Attrs& attrs,
     axis_shape = Integer(1);
   }
   // Check and assign types for scale and zero points.
-  AssignType(types[1], DataType::Float(32), axis_shape, reporter);  // 
input_scale
-  AssignType(types[2], DataType::Int(32), axis_shape, reporter);    // 
input_zero_pt
+  // AssignType(types[1], DataType::Float(32), axis_shape, reporter);  // 
input_scale

Review Comment:
   Should this be uncommented? If disabling the type check is intentional, please add a comment explaining why; otherwise restore it before merging.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to