This is an automated email from the ASF dual-hosted git repository.
sxjscience pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new ae8c974 [PERFORMANCE] [master] Layer normalization code from Marian
for CPU (#19602)
ae8c974 is described below
commit ae8c9748743ca98979964bd34643aca343f93c7c
Author: kpuatamazon <[email protected]>
AuthorDate: Mon Jan 4 08:57:05 2021 +0000
[PERFORMANCE] [master] Layer normalization code from Marian for CPU (#19602)
* Layer normalization code from Marian
* Remove MKL version of LayerNorm.
Experiment with OMP_NUM_THREADS=4, times in s, c5.12xlarge
|batch x channel| New code | MKL |
| 1x 32 | 0.0000288| 0.0000278|
| 128x 32 | 0.0000308| 0.0000311|
| 2560x 32 | 0.0000712| 0.0000672|
| 4096x 32 | 0.0000946| 0.0000910|
| 8192x 32 | 0.0001597| 0.0001523|
|16384x 32 | 0.0002905| 0.0002619|
| 1x 64 | 0.0000264| 0.0000256|
| 128x 64 | 0.0000339| 0.0000330|
| 2560x 64 | 0.0000829| 0.0000972|
| 4096x 64 | 0.0001137| 0.0001356|
| 8192x 64 | 0.0002027| 0.0002435|
|16384x 64 | 0.0003715| 0.0004639|
| 1x 128 | 0.0000262| 0.0000263|
| 128x 128 | 0.0000325| 0.0000389|
| 2560x 128 | 0.0001074| 0.0001580|
| 4096x 128 | 0.0001505| 0.0002336|
| 8192x 128 | 0.0002861| 0.0004481|
|16384x 128 | 0.0005648| 0.0008613|
| 1x 256 | 0.0000273| 0.0000276|
| 128x 256 | 0.0000390| 0.0000431|
| 2560x 256 | 0.0001533| 0.0002811|
| 4096x 256 | 0.0002258| 0.0004300|
| 8192x 256 | 0.0004300| 0.0008464|
|16384x 256 | 0.0010436| 0.0017613|
| 1x 512 | 0.0000256| 0.0000302|
| 128x 512 | 0.0000408| 0.0000551|
| 2560x 512 | 0.0002444| 0.0005225|
| 4096x 512 | 0.0003828| 0.0008147|
| 8192x 512 | 0.0008832| 0.0017192|
|16384x 512 | 0.0058463| 0.0074497|
| 1x 768 | 0.0000252| 0.0000308|
| 128x 768 | 0.0000450| 0.0000676|
| 2560x 768 | 0.0003440| 0.0007719|
| 4096x 768 | 0.0005890| 0.0013346|
| 8192x 768 | 0.0014946| 0.0026145|
|16384x 768 | 0.0089495| 0.0113557|
| 1x 1024 | 0.0000285| 0.0000308|
| 128x 1024 | 0.0000487| 0.0000786|
| 2560x 1024 | 0.0004614| 0.0010190|
| 4096x 1024 | 0.0008083| 0.0017376|
| 8192x 1024 | 0.0059020| 0.0075588|
|16384x 1024 | 0.0116553| 0.0146855|
Benchmark program
```python
import mxnet as mx
import time
def time_procedure(shape, count):
data = mx.nd.random_uniform(shape=shape, low=-1.0, high = 1.0)
factors = mx.nd.random_uniform(shape=(shape[-1],))
mx.nd.waitall()
begin = time.time()
for i in range(0, count):
out = mx.nd.LayerNorm(data, factors, factors)
mx.nd.waitall()
return (time.time() - begin) / count
count = 200
for channel in [32, 64, 128, 256, 512, 768, 1024]:
for batch in [1, 128, 2560, 4096, 8192, 16384]:
s = (batch, channel)
timing = time_procedure(s, count)
print("{:5d}x{:5d} | {:.7f}".format(s[0], s[1], timing))
```
* Enable pragma omp simd on MSVC
* Fix MSVC error C3016: 'j': index variable in OpenMP 'for' statement must
have signed integral type
* Try to make MSVC happy since it doesn't have ssize_t
* Revert "Remove MKL version of LayerNorm."
This reverts commit 740c4726c3068ac30b3809cd6280fa7e91af8c52.
* Restore MKL layer normalization code, but it isn't called yet
* Pull division out of the hot loop
* Option to use MKL version requested by @samskalicky
* Add -DUSE_MKL_LAYERNORM=ON to ubuntu MKL CPU test
Co-authored-by: Kenneth Heafield <[email protected]>
---
CMakeLists.txt | 14 ++++
LICENSE | 1 +
ci/docker/runtime_functions.sh | 1 +
src/operator/nn/layer_norm.cc | 171 +++++++++++++++++++++++++++++++++++++----
4 files changed, 173 insertions(+), 14 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02491e6..b8f9a93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -61,6 +61,7 @@ cmake_dependent_option(USE_SSE "Build with x86 SSE
instruction support" ON
option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects
support if ON
option(USE_LAPACK "Build with lapack support" ON)
option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+option(USE_MKL_LAYERNORM "Use layer normalization from MKL, which is currently
slower than internal. No effect unless USE_MKL_IF_AVAILABLE is set." OFF)
if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND
(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
option(USE_MKLDNN "Build with MKL-DNN support" ON)
else()
@@ -254,6 +255,9 @@ if(USE_TENSORRT)
${ONNX_PROTO_LIBRARY} ${ONNX_LIBRARY} ${PROTOBUF_LIBRARY})
endif()
+if(USE_MKL_LAYERNORM)
+ add_definitions(-DMXNET_USE_MKL_LAYERNORM=1)
+endif()
if(USE_MKLDNN)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
if(MSVC)
@@ -428,6 +432,16 @@ if(USE_OPENMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ # Enable pragma omp simd
+ # "While the name of this switch is 'experimental', the switch itself,
and
+ # the functionality it enables is fully supported and production-ready.
+ # The name reflects that it doesn’t enable any complete subset or
+ # version of an OpenMP standard."
+ # --
https://devblogs.microsoft.com/cppblog/simd-extension-to-c-openmp-in-visual-studio/
+ if(MSVC)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -openmp:experimental")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -openmp:experimental")
+ endif()
if(NOT BLAS STREQUAL "MKL")
# Linker flags for Intel OMP are already set in case MKL is used. Only
set if not MKL
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}
${OpenMP_EXE_LINKER_FLAGS}")
diff --git a/LICENSE b/LICENSE
index 9e87416..1ba4b66 100644
--- a/LICENSE
+++ b/LICENSE
@@ -250,6 +250,7 @@
docs/python_docs/themes/mx-theme
3rdparty/intgemm
3rdparty/tvm/3rdparty/compiler-rt/builtin_fp16.h
+ src/operator/nn/layer_norm.cc
=======================================================================================
3-clause BSD license
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 53f7c4c..f2fb2ef 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -338,6 +338,7 @@ build_ubuntu_cpu_mkl() {
-DUSE_CUDA=OFF \
-DUSE_TVM_OP=ON \
-DUSE_MKL_IF_AVAILABLE=ON \
+ -DUSE_MKL_LAYERNORM=ON \
-DUSE_BLAS=MKL \
-DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops
\
-GNinja /work/mxnet
diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 11178b3..0884720 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -15,6 +15,37 @@
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
+ *
+ * Function LayerNormCPUKernel is adapted from Marian
+ *
https://github.com/marian-nmt/marian-dev/blob/master/src/tensors/cpu/tensor_operators.cpp
+ * under the MIT license
+ * MIT License
+ *
+ * Copyright (c) 2016 Marcin Junczys-Dowmunt, the University of Edinburgh, Adam
+ * Mickiewicz University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
*/
/*!
@@ -68,23 +99,127 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}
-template<>
-void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>&
inputs,
- const std::vector<OpReqType>& req,
- const std::vector<TBlob>& outputs) {
- return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+/* CPU optimized kernel for LayerNorm assuming axis = -1.
+ * Data is the underlying storage data type.
+ * Accum is the type to use for accumulation.
+ * Apparently there isn't a reduction operator for half_t and anyway it isn't
+ * efficient to use on the CPU, so use float for reduction of half_t.
+ *
+ * width is the number of values being summed to compute a mean.
+ * instances is how many independent layer normalization problems are packed
into the tensors.
+ *
+ * Inputs:
+ * data is instances x width
+ * gamma is width
+ * beta is width
+ *
+ * Outputs:
+ * out is instances x width, can be same as data
+ * mean is instances: means of each problem
+ * std is instances: standard deviation of each problem
+ *
+ */
+template <typename Data, typename Accum = typename
+ /* By default accumulate in float32 for float16. Otherwise use
same type. */
+ std::conditional<std::is_same<mshadow::half::half_t, Data>::value,
+ float,
+ Data>::type>
+void LayerNormCPUKernel(size_t width,
+ size_t instances,
+ Data eps,
+ const Data *data,
+ const Data *gamma,
+ const Data *beta,
+ Data *out,
+ Data *mean,
+ Data *std) {
+ // Parallelize over independent instances to normalize.
+ // MSVC says index variable in OpenMP 'for' statement must have signed
integral type.
+ const mshadow::index_t signed_instances =
static_cast<mshadow::index_t>(instances);
+#pragma omp parallel for
+ for (nnvm::dim_t j = 0; j < signed_instances; ++j) {
+ const Data *from = data + j * width;
+
+ // Sum the values to compute mean.
+ Accum sum = 0.f;
+#pragma omp simd reduction(+ : sum)
+ for (size_t i = 0; i < width; ++i) {
+ sum += from[i];
+ }
+ Accum mean_value = sum / width;
+ mean[j] = static_cast<Data>(mean_value);
+
+ // Sum squares from mean to compute stddev.
+ Accum squares = 0.f;
+#pragma omp simd reduction(+ : squares)
+ for (size_t i = 0; i < width; ++i) {
+ Accum off = from[i] - mean_value;
+ squares += off * off;
+ }
+ Accum sigma = std::sqrt(squares / width + eps);
+ std[j] = static_cast<Data>(sigma);
+
+ // Write normalized values.
+ Data *to = out + j * width;
+ Accum inv_sigma = 1.f / sigma;
+#pragma omp simd
+ for (size_t i = 0; i < width; ++i) {
+ to[i] = (from[i] - mean_value) * gamma[i] * inv_sigma + beta[i];
+ }
+ }
}
-#if MSHADOW_USE_MKL == 1
-void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+/* Wrap the above LayerNormCPUKernel in MXNet's API. Returns true if it
+ * is able to run.
+ */
+bool LayerNormCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx, const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+ CHECK_EQ(inputs.size(), 3U);
+ CHECK_EQ(outputs.size(), 3U);
+
+ switch (req[layernorm::kOut]) {
+ case kNullOp:
+ return true;
+ case kWriteTo:
+ break;
+ case kWriteInplace:
+ break;
+ default:
+      // Should only be kAddTo, which isn't supported by the other
implementations either.
+ return false;
+ }
+ // Axis must be the last one.
+ int axis = GetRealAxis(param.axis, inputs[layernorm::kData].ndim());
+ if (axis != inputs[layernorm::kData].ndim() - 1) {
+ return false;
+ }
+ MSHADOW_REAL_TYPE_SWITCH(inputs[layernorm::kData].type_flag_, DType, {
+ LayerNormCPUKernel<DType>(
+ inputs[layernorm::kData].shape_[axis],
+ outputs[layernorm::kMean].Size(),
+ param.eps,
+ inputs[layernorm::kData].dptr<DType>(),
+ inputs[layernorm::kGamma].dptr<DType>(),
+ inputs[layernorm::kBeta].dptr<DType>(),
+ outputs[layernorm::kOut].dptr<DType>(),
+ outputs[layernorm::kMean].dptr<DType>(),
+ outputs[layernorm::kStd].dptr<DType>());
+ });
+ return true;
+}
+
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+bool LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
using namespace mshadow;
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
- if (req[0] == kNullOp) return;
+ if (req[0] == kNullOp) return true;
CHECK_NE(req[0], kAddTo);
CHECK_EQ(inputs.size(), 3U);
int axis = GetRealAxis(param.axis, inputs[0].ndim());
@@ -113,13 +248,25 @@ void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
outputs[layernorm::kStd].dptr<DType>(),
static_cast<DType>(param.eps));
});
+ return true;
} else {
// fallback
- LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+ return false;
}
}
#endif
+template<>
+void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx, const std::vector<TBlob>&
inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+#if MSHADOW_USE_MKL == 1 && MXNET_USE_MKL_LAYERNORM == 1
+ if (LayerNormComputeMKL(attrs, ctx, inputs, req, outputs)) return;
+#endif
+ if (LayerNormCPU(attrs, ctx, inputs, req, outputs)) return;
+ LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
+}
template<>
void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
@@ -175,11 +322,7 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
-#if MSHADOW_USE_MKL == 1
-.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
-#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
-#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::ObjectPtr& n,
const std::vector<nnvm::NodeEntry>&
ograds) {
std::vector<nnvm::NodeEntry> heads;