[GitHub] [incubator-tvm] FrozenGene commented on a change in pull request #5754: [RFC] Improve quantized convolution performance for armv8 architectures

GitBox Thu, 18 Jun 2020 20:04:32 -0700


FrozenGene commented on a change in pull request #5754:
URL: https://github.com/apache/incubator-tvm/pull/5754#discussion_r442604112




##########
File path: src/relay/op/nn/convolution.h
##########
@@ -383,6 +383,38 @@ inline bool Conv2DWinogradWeightTransformRel(const 
Array<Type>& types, int num_i
   return true;
 }
 
+// Gemm convolution shape relations
+inline bool Conv2DGemmWeightTransformRel(const Array<Type>& types, int 
num_inputs,
+                                         const Attrs& attrs, const 
TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();

Review comment:
       Suggest renaming `data` to `weight`.

##########
File path: python/tvm/relay/qnn/op/legalizations.py
##########
@@ -237,17 +237,23 @@ def is_fast_int8_on_arm():
     target = tvm.target.Target.current(allow_none=False)
     return '+v8.2a,+dotprod' in ' '.join(target.options)
 
+def is_aarch64_arm():
+    """ Checks whether the hardware has support for fast Int8 arithmetic 
operations. """
+    target = tvm.target.Target.current(allow_none=False)
+    return 'aarch64' in ' '.join(target.options)
+
 ########################
 # ARM CPU legalizations.
 ########################
 
 @qnn_conv2d_legalize.register('arm_cpu')
 def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
     # ARM prefers the dtypes to be same.
-    if is_fast_int8_on_arm():
+    if is_aarch64_arm() and attrs["data_layout"] == "NHWC" or 
is_fast_int8_on_arm():

Review comment:
       Let us add parentheses, i.e. `if (is_aarch64_arm() and attrs["data_layout"] == "NHWC") or is_fast_int8_on_arm():`

##########
File path: topi/python/topi/arm_cpu/conv2d_gemm.py
##########
@@ -0,0 +1,176 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, too-many-locals
+# pylint: disable=unused-argument, redefined-builtin
+"""GEMM Convolution schedule on ARM"""
+import tvm
+from tvm import te
+from topi import nn
+from ..util import get_const_tuple
+from ..nn.util import get_pad_tuple
+from .tensor_intrin import gemv_quantized, gemv_quantized_impl
+
+
+# Compute function
+def compute_conv2d_gemm_without_weight_transform(cfg,
+                                                 data, B_interleaved_t, 
strides, padding, dilation,
+                                                 out_dtype, kernel_size, 
output_channels):
+    """Compute conv2d by transforming the input,
+    executing GEMM and transforming the output back"""
+    batches, IH, IW, IC = get_const_tuple(data.shape)
+
+    KH, KW = kernel_size
+    OC = output_channels
+
+    K_AREA = KH * KW
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = \
+        get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w))
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, 
strides)
+
+    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
+    if pad_top or pad_left:
+        data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, 
pad_right, 0],
+                          name="data_pad")
+    else:
+        data_pad = data
+
+    # --- Im2col
+    M = OH * OW
+    K = IC * K_AREA
+    N = OC
+
+    A_shape = (batches, M, K)
+    if K_AREA == 1:
+        A = te.compute(A_shape, lambda n, x, y: data_pad[n, HSTR * (x // OW), 
WSTR * (x % OW), y],
+                       name='data_flatten')
+    else:
+        A = te.compute(A_shape, lambda n, x, y:
+                       data_pad[n,
+                                HSTR * (x // OW) + dilation_h * (y // IC) // 
KW,
+                                WSTR * (x % OW) + dilation_w * (y // IC) % KW, 
y % IC],
+                       name='data_im2col')
+    N_transformed = B_interleaved_t.shape[0]
+
+    # --- Pad if necessary
+    idxm = tvm.tir.indexmod
+
+    pad_m = 0
+    pad_k = 0
+
+    if M % 4 != 0:
+        pad_m = 4 - (M % 4)
+
+    if K % 16 != 0:
+        pad_k = 16 - (K % 16)
+
+    M_padded = M + pad_m
+    K_padded = K + pad_k
+
+    pad_before = (0, 0, 0)
+    pad_after = (0, pad_m, pad_k)
+
+    if pad_m != 0 or pad_k != 0:
+        A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, 
name="A_padded")
+
+    # --- GEMM: A*B'
+    k = te.reduce_axis((0, K_padded), "k")
+
+    A_interleaved = te.compute((batches, M_padded // 4, K_padded // 16, 4, 16),
+                               lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * 
y],
+                               name='A_interleaved')
+
+    C_interleaved = te.compute((batches, M_padded // 4, N_transformed, 4, 4),
+                               lambda b, x, y, w, z:
+                               te.sum(A_interleaved[b, x, k//16, w, idxm(k, 
16)].astype(out_dtype)*
+                                      B_interleaved_t[y, k//16, z, idxm(k, 
16)].astype(out_dtype),
+                                      axis=k),
+                               name='C_interleaved')
+
+    # --- Unpack C
+    C = te.compute((batches, M, N),
+                   lambda b, x, y:
+                   C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)],
+                   name="C", tag='injective')
+
+    # --- Produce the conv output
+    out_shape = (batches, OH, OW, OC)
+    out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z),
+                     name='conv2d_gemm_output')
+
+    return out
+
+# Schedules
+
+

Review comment:
       Unnecessary blank lines.

##########
File path: python/tvm/relay/qnn/op/legalizations.py
##########
@@ -237,17 +237,23 @@ def is_fast_int8_on_arm():
     target = tvm.target.Target.current(allow_none=False)
     return '+v8.2a,+dotprod' in ' '.join(target.options)
 
+def is_aarch64_arm():
+    """ Checks whether the hardware has support for fast Int8 arithmetic 
operations. """

Review comment:
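       The docstring is not correct: this function checks whether the target is aarch64, not whether the hardware supports fast Int8 arithmetic operations.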

##########
File path: src/relay/op/nn/convolution.h
##########
@@ -383,6 +383,38 @@ inline bool Conv2DWinogradWeightTransformRel(const 
Array<Type>& types, int num_i
   return true;
 }
 
+// Gemm convolution shape relations
+inline bool Conv2DGemmWeightTransformRel(const Array<Type>& types, int 
num_inputs,

Review comment:
       Suggest adding a doc comment that explains the principle of array packing
and why the values 4 and 16 are used.

##########
File path: src/relay/op/nn/convolution.h
##########
@@ -383,6 +383,38 @@ inline bool Conv2DWinogradWeightTransformRel(const 
Array<Type>& types, int num_i
   return true;
 }
 
+// Gemm convolution shape relations
+inline bool Conv2DGemmWeightTransformRel(const Array<Type>& types, int 
num_inputs,
+                                         const Attrs& attrs, const 
TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  CHECK_EQ(data->shape.size(), 4) << "Only support HWIO kernel layout";
+
+  const auto K = data->shape[0] * data->shape[1] * data->shape[2];
+  const auto N = data->shape[3];
+
+  auto k_mod_16 = indexmod(K, 16);
+  auto n_mod_4 = indexmod(N, 4);

Review comment:
       Could we make these values configurable instead of hard-coding them? We are
creating a single op, but I don't think it should be restricted to ARM platforms. If
someone wants to use it on an x86 CPU (e.g. with AVX512), the numbers could be different.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [incubator-tvm] FrozenGene commented on a change in pull request #5754: [RFC] Improve quantized convolution performance for armv8 architectures

Reply via email to