This is an automated email from the ASF dual-hosted git repository.
sxjscience pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new ae8c974 [PERFORMANCE] [master] Layer normalization code from Marian
for CPU (#19602)
ae8c974 is described below
commit ae8c9748743ca98979964bd34643aca343f93c7c
Author: kpuatamazon <[email protected]>
AuthorDate: Mon Jan 4 08:57:05 2021 +0000
[PERFORMANCE] [master] Layer normalization code from Marian for CPU (#19602)
* Layer normalization code from Marian
* Remove MKL version of LayerNorm.
Experiment with OMP_NUM_THREADS=4, times in s, c5.12xlarge
|batch x channel| New code | MKL |
| 1x 32 | 0.0000288| 0.0000278|
| 128x 32 | 0.0000308| 0.0000311|
| 2560x 32 | 0.0000712| 0.0000672|
| 4096x 32 | 0.0000946| 0.0000910|
| 8192x 32 | 0.0001597| 0.0001523|
|16384x 32 | 0.0002905| 0.0002619|
| 1x 64 | 0.0000264| 0.0000256|
| 128x 64 | 0.0000339| 0.0000330|
| 2560x 64 | 0.0000829| 0.0000972|
| 4096x 64 | 0.0001137| 0.0001356|
| 8192x 64 | 0.0002027| 0.0002435|
|16384x 64 | 0.0003715| 0.0004639|
| 1x 128 | 0.0000262| 0.0000263|
| 128x 128 | 0.0000325| 0.0000389|
| 2560x 128 | 0.0001074| 0.0001580|
| 4096x 128 | 0.0001505| 0.0002336|
| 8192x 128 | 0.0002861| 0.0004481|
|16384x 128 | 0.0005648| 0.0008613|
| 1x 256 | 0.0000273| 0.0000276|
| 128x 256 | 0.0000390| 0.0000431|
| 2560x 256 | 0.0001533| 0.0002811|
| 4096x 256 | 0.0002258| 0.0004300|
| 8192x 256 | 0.0004300| 0.0008464|
|16384x 256 | 0.0010436| 0.0017613|
| 1x 512 | 0.0000256| 0.0000302|
| 128x 512 | 0.0000408| 0.0000551|
| 2560x 512 | 0.0002444| 0.0005225|
| 4096x 512 | 0.0003828| 0.0008147|
| 8192x 512 | 0.0008832| 0.0017192|
|16384x 512 | 0.0058463| 0.0074497|
| 1x 768 | 0.0000252| 0.0000308|
| 128x 768 | 0.0000450| 0.0000676|
| 2560x 768 | 0.0003440| 0.0007719|
| 4096x 768 | 0.0005890| 0.0013346|
| 8192x 768 | 0.0014946| 0.0026145|
|16384x 768 | 0.0089495| 0.0113557|
| 1x 1024 | 0.0000285| 0.0000308|
| 128x 1024 | 0.0000487| 0.0000786|
| 2560x 1024 | 0.0004614| 0.0010190|
| 4096x 1024 | 0.0008083| 0.0017376|
| 8192x 1024 | 0.0059020| 0.0075588|
|16384x 1024 | 0.0116553| 0.0146855|
Benchmark program
```python
import mxnet as mx
import time
def time_procedure(shape, count):
data = mx.nd.random_uniform(shape=shape, low=-1.0, high = 1.0)
factors = mx.nd.random_uniform(shape=(shape[-1],))
mx.nd.waitall()
begin = time.time()
for i in range(0, count):
out = mx.nd.LayerNorm(data, factors, factors)
mx.nd.waitall()
return (time.time() - begin) / count
count = 200
for channel in [32, 64, 128, 256, 512, 768, 1024]:
for batch in [1, 128, 2560, 4096, 8192, 16384]:
s = (batch, channel)
timing = time_procedure(s, count)
print("{:5d}x{:5d} | {:.7f}".format(s[0], s[1], timing))
```
* Enable pragma omp simd on MSVC
* Fix MSVC error C3016: 'j': index variable in OpenMP 'for' statement must
have signed integral type
* Try to make MSVC happy since it doesn't have ssize_t
* Revert "Remove MKL version of LayerNorm."
This reverts commit 740c4726c3068ac30b3809cd6280fa7e91af8c52.
* Restore MKL layer normalization code, but it isn't called yet
* Pull division out of the hot loop
* Option to use MKL version requested by @samskalicky
* Add -DUSE_MKL_LAYERNORM=ON to ubuntu MKL CPU test
Co-authored-by: Kenneth Heafield <[email protected]>
---
CMakeLists.txt | 14 ++++
LICENSE | 1 +
ci/docker/runtime_functions.sh | 1 +
src/operator/nn/layer_norm.cc | 171 +++++++++++++++++++++++++++++++++++++----
4 files changed, 173 insertions(+), 14 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02491e6..b8f9a93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE
instruction support" ON
option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects
support if ON
option(USE_LAPACK "Build with lapack support" ON)
option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently
slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND
(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
option(USE_MKLDNN "Build with MKL-DNN support" ON)
else()
@@ -254,6 +255,9 @@ if(USE_TENSORRT)
${ONNX_PROTO_LIBRARY} ${ONNX_LIBRARY} ${PROTOBUF_LIBRARY})
endif()
+if(USE_MKL_LAYERNORM)
+ add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
if(USE_MKLDNN)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
if(MSVC)
@@ -428,6 +432,16 @@ if(USE_OPENMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ # Enable pragma omp simd
+ # "While the name of this switch is 'experimental', the switch itself,
and
+ # the functionality it enables is fully supported and production-ready.
+ # The name reflects that it doesn’t enable any complete subset or
+ # version of an OpenMP standard."
+ # --
https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+ if(MSVC)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+ endif()
if(NOT BLAS STREQUAL "MKL")
# Linker flags for Intel OMP are already set in case MKL is used. Only
set if not MKL
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}
${OpenMP_EXE_LINKER_FLAGS}")
diff --git a/LICENSE b/LICENSE
index 9e87416..1ba4b66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -250,6 +250,7 @@
docs/python_docs/themes/mx-theme
3rdparty/intgemm
3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
+ src/operator/nn/layer_norm.cc
=======================================================================================
3-clause BSD license
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 53f7c4c..f2fb2ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -338,6 +338,7 @@ build_ubuntu_cpu_mkl() {
-DUSE_CUDA=OFF \
-DUSE_TVM_OP=ON \
-DUSE_MKL_IF_AVAILABLE=ON \
+ -DUSE_MKL_LAYERNORM=ON \
-DUSE_BLAS=MKL \
-DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops
\
-GNinja /work/mxnet
diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b3..0884720 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ *
https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
*/
/*!
@@ -68,23 +99,127 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}
-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>&
inputs,
- const std::vector<OpReqType>& req,
- const std::vector<TBlob>& outputs) {
- return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ * Apparently there isn't a reduction operator for half_t and anyway it isn't
+ * efficient to use on the CPU, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed
into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be same as data
+ * mean is instances: means of each problem
+ * std is instances: standard deviation of each problem
+ *
+ */
+template <typename Data, typename Accum = typename
+ /* By default accumulate in float32 for float16. Otherwise use
same type. */
+ std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+ float,
+ Data>::type>
+void LayerNormCPUKernel(size_t width,
+ size_t instances,
+ Data eps,
+ const Data *data,
+ const Data *gamma,
+ const Data *beta,
+ Data *out,
+ Data *mean,
+ Data *std) {
+ // Parallelize over independent instances to normalize.
+ // MSVC says index variable in OpenMP 'for' statement must have signed
integral type.
+ const mshadow::index_t signed_instances =
static_cast<mshadow::index_t>(instances);
+#pragma omp parallel for
+ for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
+ const Data *from = data + j * width;
+
+ // Sum the values to compute mean.
+ Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+ for (size_t i = 0; i < width; ++i) {
+ sum += from[i];
+ }
+ Accum mean_value = sum / width;
+ mean[j] = static_cast<Data>(mean_value);
+
+ // Sum squares from mean to compute stddev.
+ Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+ for (size_t i = 0; i < width; ++i) {
+ Accum off = from[i] - mean_value;
+ squares += off * off;
+ }
+ Accum sigma = std::sqrt(squares / width + eps);
+ std[j] = static_cast<Data>(sigma);
+
+ // Write normalized values.
+ Data *to = out + j * width;
+ Accum inv_sigma = 1.f / sigma;
+#pragma omp simd
+ for (size_t i = 0; i < width; ++i) {
+ to[i] = (from[i] - mean_value) * gamma[i] * inv_sigma + beta[i];
+ }
+ }
}
-#if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+/* Wrap the above LayerNormCPUKernel in MXNet's API. Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx, const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+ CHECK_EQ(inputs.size(), 3U);
+ CHECK_EQ(outputs.size(), 3U);
+
+ switch (req[layernorm::kOut]) {
+ case kNullOp:
+ return true;
+ case kWriteTo:
+ break;
+ case kWriteInplace:
+ break;
+ default:
+      // Should only be kAddTo, which isn't supported by the other
implementations either.
+ return false;
+ }
+ // Axis must be the last one.
+ int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+ if (axis != inputs[layernorm::kData].ndim() - 1) {
+ return false;
+ }
+ MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+ LayerNormCPUKernel<DType>(
+ inputs[layernorm::kData].shape_[axis],
+ outputs[layernorm::kMean].Size(),
+ param.eps,
+ inputs[layernorm::kData].dptr<DType>(),
+ inputs[layernorm::kGamma].dptr<DType>(),
+ inputs[layernorm::kBeta].dptr<DType>(),
+ outputs[layernorm::kOut].dptr<DType>(),
+ outputs[layernorm::kMean].dptr<DType>(),
+ outputs[layernorm::kStd].dptr<DType>());
+ });
+ return true;
+}
+
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
- if (req[0] == kNullOp) return;
+ if (req[0] == kNullOp) return true;
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), 3U);
int axis = GetRealAxis(param.axis, inputs[0].ndim());
@@ -113,13 +248,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
outputs[layernorm::kStd].dptr<DType>(),
static_cast<DType>(param.eps));
});
+ return true;
} else {
// fallback
- LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+ return false;
}
}
#endif
+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx, const std::vector<TBlob>&
inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+ if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+ if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+ LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}
template<>
void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +322,7 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>&
ograds) {
std::vector<nnvm::NodeEntry> heads;