mehrdadh commented on code in PR #13752:
URL: https://github.com/apache/tvm/pull/13752#discussion_r1096409344
##########
python/tvm/topi/arm_cpu/qnn.py:
##########
@@ -368,3 +389,139 @@ def kernel_ptr(buffer, c, offset=0):
def schedule_qnn_depthwise_conv2d(_attrs, _outs, _target):
"""Schedule function for qnn.depthwise_conv2d."""
return None
+
+
+def _make_unrolled_conv2d_primfunc(
+ output_dimensions: Tuple[int, int, int],
+ buffer_shapes: Tuple[Tuple, Tuple, Tuple, Tuple, Tuple],
+ function_names: Dict[Tuple, str],
+ function_code: str,
+ ptr_gens: Tuple[Callable, Callable],
+ output_layout="NHWC",
+):
+ out_height, out_width, out_channels = output_dimensions
+ data_shape, kernel_shape, bias_shape, scale_shape, output_shape =
buffer_shapes
+ data_ptr, kernel_ptr = ptr_gens
+
+ def output_ptr(output, y, c):
+ if output_layout == "NHWC":
+ return _make_tscript_ptr(output, y * const(out_width *
out_channels) + c, 1)
+ elif output_layout == "NCHW":
+ return _make_tscript_ptr(
+ output, c * const(out_height * out_width) + y *
const(out_width), 1
+ )
+ else:
+ raise TVMError(f"Unsupported out_layout '{output_layout}'!")
+
+ def make_row_call(buffers, c_var, y, c):
+ output, data, kernel, bias, scale = buffers
+ return _make_tscript_call(
+ function_names[(y + c) % 2, c % 2, 0],
+ output_ptr(output, y, c_var + c),
+ data_ptr(data, y, c_var + c, offset=(y + c) % 2),
+ kernel_ptr(kernel, c_var + c, offset=c),
+ _bias_ptr(bias, c_var + c),
+ _scale_ptr(scale, c_var + c),
+ )
+
+ @T.prim_func
+ def biased_quantized_conv2d(
+ data_handle: T.handle,
+ kernel_handle: T.handle,
+ bias_handle: T.handle,
+ scale_handle: T.handle,
+ output_handle: T.handle,
+ ) -> None:
+ # Same setup is used as in _make_conv2d_primfunc
+ T.func_attr({"global_symbol": "main", "tir.noalias": True})
+ data = T.match_buffer(data_handle, data_shape, dtype="int16")
+ kernel = T.match_buffer(kernel_handle, kernel_shape, dtype="int16")
+ bias = T.match_buffer(bias_handle, bias_shape, dtype="int32")
+ scale = T.match_buffer(scale_handle, scale_shape)
+ output = T.match_buffer(output_handle, output_shape, dtype="int16")
+
+ # pylint: disable=unused-variable
+ output[0, 0, 0, 0] = 0
+ __1 = data[0, 0, 0, 0]
+ __2 = kernel[0, 0, 0, 0]
+ __3 = bias[0, 0, 0, 0]
+ __4 = scale[0]
+ # pylint: enable=unused-variable
+
+ for c_ax in T.grid(out_channels // 2):
+ with T.block("conv2ds"):
+ T.block_attr({"pragma_import_c": function_code})
+ c = T.axis.remap("S", [c_ax]) * 2
+
+ # TODO how can I programatically make the right number of
Review Comment:
Were you planning to change this part?
##########
python/tvm/topi/arm_cpu/qnn_legalize.py:
##########
@@ -0,0 +1,349 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""QNN legalization transforms that help eliminate sparse channels.
+
+Some models (like MobileNetV1 when fine-tuned) have output channels in their
kernels which are
+completely full of zeros. Sometimes these can be optimized away by the C
compiler, but this does not
+happen when complex schedules (like the ACLE tensordot convolutions) are used.
+
+Instead, we will remove these channels by replacing blocks of operators with
equivalent "denser"
+ones during legalization. This is harder than it looks - while the outputs of
channels with all-zero
+kernels do not depend on the input data, they are usually not zero. We work
around this by computing
+how these constant values affect subsequent operators, and "folding" these
effects into a bias_add.
+
+It would eventually be nice to have a generalized, cross-target solution for
removing zero channels,
+as there is no downside. This may be possible with Relax, but I'm unsure.
+"""
+
+import numpy as np
+from scipy.signal import convolve2d
+from tvm.topi.utils import get_const_tuple
+from tvm import nd, relay
+from .qnn_alter_op import prev_ops_match, edit_attrs
+from ..nn import qnn_bias_add_legalize
+
+
+def _compute_fixed_conv2d_outputs(requantize_op):
+ """Compute all conv2d output values that do not depend on the layer
input."""
+ bias_add_op = requantize_op.args[0]
+ conv2d_op = bias_add_op.args[0]
+
+ assert conv2d_op.attrs.kernel_layout.isalpha()
+ assert conv2d_op.attrs.groups == 1
+ kernel = conv2d_op.args[1].data.numpy()
+ oc_axis = conv2d_op.attrs.kernel_layout.index("O")
+
+ num_channels = kernel.shape[oc_axis]
+ rq_input_scale = requantize_op.args[1].data.numpy()
+ rq_output_scale = requantize_op.args[3].data.numpy().item()
+ rq_output_zero_point = requantize_op.args[4].data.numpy().item()
+ bias_data = bias_add_op.args[1].data.numpy()
+
+ fixed_outputs = {}
+
+ for i in range(num_channels):
+ if np.any(np.take(kernel, i, axis=oc_axis)):
+ continue
+ scale = rq_input_scale[i] / rq_output_scale
+ channel_constant = round(bias_data[i] * scale + rq_output_zero_point)
+ clipped = min(127, max(-128, channel_constant))
+ fixed_outputs[i] = clipped
+
+ return fixed_outputs
+
+
+def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs):
+ """Compute all depthwise conv2d output values that do not depend on the
PREVIOUS layer input.
+
+ We take as input a requantize operator, and a dictionary of which inputs
to our depthwise
+ operator are fixed and what values they are fixed to. However, a fixed
input to one channel
+ of our depthwise operator does NOT guarantee we can remove the output,
because of padding.
+ This function checks if the padding makes a difference in the outputs, and
if not, removes
+ the channels from the depthwise_conv2d.
+ """
+ bias_add_op = requantize_op.args[0]
+ depthwise_op = bias_add_op.args[0]
+
+ assert depthwise_op.attrs.kernel_layout.isalpha()
+ assert depthwise_op.attrs.groups > 1
+ kernel = depthwise_op.args[1].data.numpy()
+ oc_axis = depthwise_op.attrs.kernel_layout.index("O")
+
+ conv_input_zero_point = depthwise_op.args[2].data.numpy().item()
+ rq_input_scale = requantize_op.args[1].data.numpy()
+ rq_output_scale = requantize_op.args[3].data.numpy().item()
+ rq_output_zero_point = requantize_op.args[4].data.numpy().item()
+ bias_data = bias_add_op.args[1].data.numpy()
+
+ kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size)
+
+ # Make a kernel_size x kernel_size array of fixed_input
+ # Pad it with zeros usint padding
+ # Do a convolution and make sure
+
+ fixed_outputs = {}
+
+ for i, fixed_input in fixed_channel_inputs.items():
+ input_array = np.full(kernel_size, fixed_input, dtype="int32") -
conv_input_zero_point
+ kernel_channel = np.take(kernel, i, axis=oc_axis).reshape(kernel_size)
+ scale = rq_input_scale[i] / rq_output_scale
+
+ convolved = convolve2d(input_array, kernel_channel, mode="same")
+ rounded = np.around((convolved + bias_data[i]) * scale).astype("int32")
+ clipped = np.clip(rounded + rq_output_zero_point, -128, 127)
+
+ # We require the ENTIRE padded convolution to all have the same
clipped value before we do
+ # a replacement. This is excessive - we only have to check for the
padding that will
+ # actually be performed on the depthwise convolution, which is often
less. If we felt even
+ # more ambitious, we could do the replacement for "close enough"
looking convolution
+ # outputs, which in theory could reduce accuracy but in practice does
not. Doing this would
+ # yield a ~0.5% speed gain on MobileNetV1, and nothing on other models.
+
+ if np.all(clipped == clipped[0, 0]):
+ fixed_outputs[i] = clipped[0, 0]
+
+ # TODO look for all-zero entries in the depthwise kernel. I don't think
these really occur in
Review Comment:
Same here?
Also, for TODOs, please add your GitHub handle in ().
##########
python/tvm/topi/arm_cpu/qnn_legalize.py:
##########
@@ -0,0 +1,349 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""QNN legalization transforms that help eliminate sparse channels.
+
+Some models (like MobileNetV1 when fine-tuned) have output channels in their
kernels which are
+completely full of zeros. Sometimes these can be optimized away by the C
compiler, but this does not
+happen when complex schedules (like the ACLE tensordot convolutions) are used.
+
+Instead, we will remove these channels by replacing blocks of operators with
equivalent "denser"
+ones during legalization. This is harder than it looks - while the outputs of
channels with all-zero
+kernels do not depend on the input data, they are usually not zero. We work
around this by computing
+how these constant values affect subsequent operators, and "folding" these
effects into a bias_add.
+
+It would eventually be nice to have a generalized, cross-target solution for
removing zero channels,
+as there is no downside. This may be possible with Relax, but I'm unsure.
+"""
+
+import numpy as np
+from scipy.signal import convolve2d
+from tvm.topi.utils import get_const_tuple
+from tvm import nd, relay
+from .qnn_alter_op import prev_ops_match, edit_attrs
+from ..nn import qnn_bias_add_legalize
+
+
+def _compute_fixed_conv2d_outputs(requantize_op):
+ """Compute all conv2d output values that do not depend on the layer
input."""
+ bias_add_op = requantize_op.args[0]
+ conv2d_op = bias_add_op.args[0]
+
+ assert conv2d_op.attrs.kernel_layout.isalpha()
+ assert conv2d_op.attrs.groups == 1
+ kernel = conv2d_op.args[1].data.numpy()
+ oc_axis = conv2d_op.attrs.kernel_layout.index("O")
+
+ num_channels = kernel.shape[oc_axis]
+ rq_input_scale = requantize_op.args[1].data.numpy()
+ rq_output_scale = requantize_op.args[3].data.numpy().item()
+ rq_output_zero_point = requantize_op.args[4].data.numpy().item()
+ bias_data = bias_add_op.args[1].data.numpy()
+
+ fixed_outputs = {}
+
+ for i in range(num_channels):
+ if np.any(np.take(kernel, i, axis=oc_axis)):
+ continue
+ scale = rq_input_scale[i] / rq_output_scale
+ channel_constant = round(bias_data[i] * scale + rq_output_zero_point)
+ clipped = min(127, max(-128, channel_constant))
+ fixed_outputs[i] = clipped
+
+ return fixed_outputs
+
+
+def _compute_fixed_depthwise_outputs(requantize_op, fixed_channel_inputs):
+ """Compute all depthwise conv2d output values that do not depend on the
PREVIOUS layer input.
+
+ We take as input a requantize operator, and a dictionary of which inputs
to our depthwise
+ operator are fixed and what values they are fixed to. However, a fixed
input to one channel
+ of our depthwise operator does NOT guarantee we can remove the output,
because of padding.
+ This function checks if the padding makes a difference in the outputs, and
if not, removes
+ the channels from the depthwise_conv2d.
+ """
+ bias_add_op = requantize_op.args[0]
+ depthwise_op = bias_add_op.args[0]
+
+ assert depthwise_op.attrs.kernel_layout.isalpha()
+ assert depthwise_op.attrs.groups > 1
+ kernel = depthwise_op.args[1].data.numpy()
+ oc_axis = depthwise_op.attrs.kernel_layout.index("O")
+
+ conv_input_zero_point = depthwise_op.args[2].data.numpy().item()
+ rq_input_scale = requantize_op.args[1].data.numpy()
+ rq_output_scale = requantize_op.args[3].data.numpy().item()
+ rq_output_zero_point = requantize_op.args[4].data.numpy().item()
+ bias_data = bias_add_op.args[1].data.numpy()
+
+ kernel_size = get_const_tuple(depthwise_op.attrs.kernel_size)
+
+ # Make a kernel_size x kernel_size array of fixed_input
Review Comment:
Remove this, or if it is needed for explanation, make it more polished?
##########
src/relay/transforms/simplify_expr.cc:
##########
@@ -979,7 +992,16 @@ Pass SimplifyExpr() {
return CreateFunctionPass(pass_func, 0, "SimplifyExpr", {"InferType"});
}
+Pass SimplifyExprPostAlterOp() {
+ runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)>
pass_func =
+ [=](Function f, IRModule m, PassContext pc) {
+ return Downcast<Function>(SimplifyExprPostAlterOp(f, m));
+ };
+ return CreateFunctionPass(pass_func, 0, "SimplifyExprPostAlterOp",
{"InferType"});
+}
+
TVM_REGISTER_GLOBAL("relay._transform.SimplifyExpr").set_body_typed(SimplifyExpr);
+// Don't globally register SimplifyExprPostAlterOp
Review Comment:
why is that?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]