szha closed pull request #11573: Add stable nrm2 Reducer
URL: https://github.com/apache/incubator-mxnet/pull/11573
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index a8c650ce8a7..463c0dffe3e 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit a8c650ce8a708608a282c4d1e251c57873a8db25
+Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 5953568c7fa..81a55c4a013 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -697,6 +697,22 @@ struct product {
MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src,
volatile DType& none) { // NOLINT(*)
Reduce(dst, src);
}
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
src_val) { // NOLINT(*)
+ Reduce(dst_val, src_val);
+ }
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
dst_residual, volatile DType& src_val, volatile DType& src_residual) { //
NOLINT(*)
+ Reduce(dst_val, src_val);
+ }
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType&
none) {} // NOLINT(*)
/*!
*\brief calculate gradient of redres with respect to redsrc,
* redres: reduced result, redsrc: one of reduction element
@@ -762,6 +778,26 @@ struct nansum {
residual = (t - dst) - y;
dst = t;
}
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
src_val) { // NOLINT(*)
+ Reduce(dst_val, src_val);
+ }
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
dst_residual, volatile DType& src_val, volatile DType& src_residual) { //
NOLINT(*)
+ DType t1 = dst_val + src_val;
+ DType e = t1 - src_val;
+ DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual +
src_residual;
+ dst_val = t1 + t2;
+ dst_residual = t2 - (dst_val - t1);
+ }
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType&
residual) {} // NOLINT(*)
/*!
*\brief set the initial value during reduction
*/
@@ -799,6 +835,22 @@ struct nanprod {
MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src,
volatile DType& none) { // NOLINT(*)
Reduce(dst, src);
}
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
src_val) { // NOLINT(*)
+ Reduce(dst_val, src_val);
+ }
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
dst_residual, volatile DType& src_val, volatile DType& src_residual) { //
NOLINT(*)
+ Reduce(dst_val, src_val);
+ }
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+ /*! \brief finalize reduction */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType&
none) {} // NOLINT(*)
/*!
*\brief set the initial value during reduction
*/
@@ -806,6 +858,7 @@ struct nanprod {
MSHADOW_XINLINE static void SetInitValue(DType & initv) { // NOLINT(*)
initv = 1;
}
+
/*!
*\brief set the initial value during reduction
*/
@@ -815,6 +868,76 @@ struct nanprod {
}
};
+/*! \brief compute l2 norm */
+struct nrm2 {
+ /*! \brief do reduction into dst */
+ template<typename DType>
+ MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares, volatile
DType src) { // NOLINT(*)
+ sum_of_squares += src * src;
+ }
+ /*! \brief do stable reduction into dst */
+ template<typename DType>
+ MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares, volatile
DType src, volatile DType& scale) { // NOLINT(*)
+ if (src != 0) {
+ DType abs = mshadow_op::abs::Map(src);
+ if (scale < abs) {
+ sum_of_squares = 1 + sum_of_squares * (scale / abs) * (scale / abs);
+ scale = abs;
+ } else {
+ sum_of_squares = sum_of_squares + (abs / scale) * (abs / scale);
+ }
+ }
+ }
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType&
src_val) { // NOLINT(*)
+ dst_val += src_val;
+ }
+ /*! \brief combine the results of two reducers */
+ template<typename DType>
+ MSHADOW_XINLINE static void Merge(volatile DType& dst_ssq, volatile DType&
dst_scale, volatile DType& src_ssq, volatile DType& src_scale) { // NOLINT(*)
+ if (dst_scale != 0 && dst_scale >= src_scale) {
+ dst_ssq = dst_ssq + src_ssq * (src_scale / dst_scale) * (src_scale /
dst_scale);
+ } else if (src_scale != 0 && dst_scale < src_scale) {
+ dst_ssq = src_ssq + dst_ssq * (dst_scale / src_scale) * (dst_scale /
src_scale);
+ dst_scale = src_scale;
+ }
+ }
+ /*! \brief finalize reduction result */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& sum_of_squares) { //
NOLINT(*)
+ sum_of_squares = math::sqrt(sum_of_squares);
+ }
+ /*! \brief finalize reduction result */
+ template<typename DType>
+ MSHADOW_XINLINE static void Finalize(volatile DType& sum_of_squares,
volatile DType& scale) { // NOLINT(*)
+ sum_of_squares = scale * math::sqrt(sum_of_squares);
+ }
+ /*!
+ *\brief calculate gradient of redres with respect to redsrc,
+ * redres: reduced result, redsrc: one of reduction element
+ */
+ template<typename DType>
+ MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) {
+ return redsrc / redres;
+ }
+ /*!
+ *\brief set the initial value during reduction
+ */
+ template<typename DType>
+ MSHADOW_XINLINE static void SetInitValue(DType &sum_of_squares) { //
NOLINT(*)
+ sum_of_squares = 0;
+ }
+ /*!
+ *\brief set the initial value during reduction
+ */
+ template<typename DType>
+ MSHADOW_XINLINE static void SetInitValue(DType &sum_of_squares, DType
&scale) { // NOLINT(*)
+ SetInitValue(sum_of_squares);
+ scale = 0;
+ }
+};
+
struct nanprod_grad : public mxnet_op::tunable {
template<typename DType>
MSHADOW_XINLINE static DType Map(DType a, DType b) {
diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh
b/src/operator/tensor/broadcast_reduce-inl.cuh
index b6bb39a1984..5c9b45f547f 100644
--- a/src/operator/tensor/broadcast_reduce-inl.cuh
+++ b/src/operator/tensor/broadcast_reduce-inl.cuh
@@ -123,27 +123,32 @@ __global__ void reduce_kernel(const int N, const int M,
const bool addto,
// Fix bx to avoid bank conflicts. Assumes warpSize number of banks
const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx +
1) : bx;
const int it0 = tidx + tidy*fbx;
- shTile[it0] = val;
+ shTile[it0 * 2] = val;
+ shTile[it0 * 2 + 1] = residual;
__syncthreads();
for (int t=1;t < by;t <<= 1) {
- DType tmp, residual;
- Reducer::SetInitValue(tmp, residual);
- if (tidy + t < by) tmp = shTile[it0 + t*fbx];
+ DType tmp, tmp_residual;
+ Reducer::SetInitValue(tmp, tmp_residual);
+ if (tidy + t < by) {
+ tmp = shTile[(it0 + t*fbx) * 2];
+ tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+ }
__syncthreads();
- Reducer::Reduce(shTile[it0], tmp, residual);
+ Reducer::Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp,
tmp_residual);
__syncthreads();
}
if (idx < N && tidy == 0) {
- assign(&small[idx + m0*N], addto, shTile[tidx]);
+ Reducer::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+ assign(&small[idx + m0*N], addto, shTile[tidx * 2]);
}
} else {
if (idx < N) {
+ Reducer::Finalize(val, residual);
assign(&small[idx + m0*N], addto, val);
}
}
}
}
-
}
template<typename Reducer, int ndim, typename DType, typename OP1, typename
OP2, int unroll>
@@ -207,27 +212,32 @@ __global__ void reduce_kernel(const int N, const int M,
const bool addto,
// Fix bx to avoid bank conflicts. Assumes warpSize number of banks
const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx +
1) : bx;
const int it0 = tidx + tidy*fbx;
- shTile[it0] = val;
+ shTile[it0 * 2] = val;
+ shTile[it0 * 2 + 1] = residual;
__syncthreads();
for (int t=1;t < by;t <<= 1) {
- DType tmp, residual;
- Reducer::SetInitValue(tmp, residual);
- if (tidy + t < by) tmp = shTile[it0 + t*fbx];
+ DType tmp, tmp_residual;
+ Reducer::SetInitValue(tmp, tmp_residual);
+ if (tidy + t < by) {
+ tmp = shTile[(it0 + t*fbx) * 2];
+ tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+ }
__syncthreads();
- Reducer::Reduce(shTile[it0], tmp, residual);
+ Reducer::Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp,
tmp_residual);
__syncthreads();
}
if (idx < N && tidy == 0) {
- assign(&small[idx + m0*N], addto, shTile[tidx]);
+ Reducer::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+ assign(&small[idx + m0*N], addto, shTile[tidx * 2]);
}
} else {
if (idx < N) {
+ Reducer::Finalize(val, residual);
assign(&small[idx + m0*N], addto, val);
}
}
}
}
-
}
// Simple reduction of lines when M is small
@@ -244,6 +254,7 @@ __global__ void reduce_lines_kernel(const int N, const int
M, const bool addto,
}
if (idx < N) {
+ Reducer::Finalize(val, residual);
assign(&small_out[idx], addto, val);
}
@@ -453,7 +464,7 @@ ReduceImplConfig<ndim> ConfigureReduceImpl(const TShape&
small, const TShape& bi
by++;
}
config.kernel_1.shMemSize = (config.kernel_1.blockDim.x > 1) ?
- config.kernel_1.blockDim.x*by*sizeof(DType) : 0;
+ config.kernel_1.blockDim.x*by*sizeof(DType) * 2 : 0;
// Maximum number of times we want TB to loop in M
// Max size of M-block each TB can handle
int maxMblock = config.kernel_1.blockDim.x*config.maxLoopPerTB;
@@ -464,7 +475,7 @@ ReduceImplConfig<ndim> ConfigureReduceImpl(const TShape&
small, const TShape& bi
ceil_idiv<unsigned int>(config.N, config.kernel_1.blockDim.x));
config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext);
config.kernel_1.shMemSize = (config.kernel_1.blockDim.y > 1) ?
- config.kernel_1.blockDim.x*config.kernel_1.blockDim.y*sizeof(DType) :
0;
+ config.kernel_1.blockDim.x*config.kernel_1.blockDim.y*sizeof(DType) *
2 : 0;
// Maximum number of times we want TB to loop in M
// Max size of M-block each TB can handle
int maxMblock = config.kernel_1.blockDim.y*config.maxLoopPerTB;
diff --git a/src/operator/tensor/broadcast_reduce-inl.h
b/src/operator/tensor/broadcast_reduce-inl.h
index 76ec92a9e72..713e3f1ac60 100644
--- a/src/operator/tensor/broadcast_reduce-inl.h
+++ b/src/operator/tensor/broadcast_reduce-inl.h
@@ -165,6 +165,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const
int M, const bool ad
coord = unravel(k, rshape);
Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual);
}
+ Reducer::Finalize(val, residual);
assign(&small[idx], addto, val);
}
@@ -256,6 +257,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const
int M, const bool ad
Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs],
rhs[idx_rhs])), residual);
}
+ Reducer::Finalize(val, residual);
assign(&small[idx], addto, val);
}
diff --git a/src/operator/tensor/broadcast_reduce_op.h
b/src/operator/tensor/broadcast_reduce_op.h
index ac7199a9482..d9a749e0db8 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -1005,9 +1005,8 @@ void LpNormCompute(const nnvm::NodeAttrs& attrs,
ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, mshadow_op::abs>(
ctx, inputs, req, outputs, small);
} else if (param.ord == 2) {
- ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, mshadow_op::square>(
+ ReduceAxesComputeImpl<xpu, mshadow_op::nrm2, false, mshadow_op::identity>(
ctx, inputs, req, outputs, small);
- SqRootForL2<xpu>(ctx, req[0], outputs[0]);
}
}
diff --git a/tests/python/unittest/test_ndarray.py
b/tests/python/unittest/test_ndarray.py
index cf5906ae454..b57e71d73b2 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -17,6 +17,7 @@
import mxnet as mx
import numpy as np
+from distutils.version import LooseVersion
import os
import pickle as pkl
import unittest
@@ -1276,10 +1277,19 @@ def test_ndarray_astype():
@with_seed()
def test_norm(ctx=default_context()):
+ try:
+ import scipy
+ assert LooseVersion(scipy.__version__) >= LooseVersion('0.1')
+ from scipy.linalg import norm as sp_norm
+ except (AssertionError, ImportError):
+ print("Could not import scipy.linalg.norm or scipy is too old. "
+ "Falling back to numpy.linalg.norm which is not numerically
stable.")
+ from numpy.linalg import norm as sp_norm
+
def l1norm(input_data, axis=0, keepdims=False):
return np.sum(abs(input_data), axis=axis, keepdims=keepdims)
def l2norm(input_data, axis=0, keepdims=False):
- return np.linalg.norm(input_data, axis=axis, keepdims=keepdims)
+ return sp_norm(input_data, axis=axis, keepdims=keepdims)
in_data_dim = random_sample([4,5,6], 1)[0]
in_data_shape = rand_shape_nd(in_data_dim)
diff --git a/tests/python/unittest/test_operator.py
b/tests/python/unittest/test_operator.py
index ae5cba21711..5a2067eab4a 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -23,6 +23,7 @@
import math
import random
import itertools
+from distutils.version import LooseVersion
from numpy.testing import assert_allclose, assert_array_equal
from mxnet.test_utils import *
from mxnet.base import py_str, MXNetError, _as_list
@@ -3031,13 +3032,22 @@ def npy_layer_norm(data, gamma, beta, axis=1, eps=1E-5):
grad_nodes={'data': req, 'gamma': req, 'beta':
req},
numeric_eps=1e-2, rtol=1e-2, atol=1e-2)
[email protected]("Flaky test:
https://github.com/apache/incubator-mxnet/issues/11509")
@with_seed()
def test_norm():
+ try:
+ import scipy
+ assert LooseVersion(scipy.__version__) >= LooseVersion('0.1')
+ from scipy.linalg import norm as sp_norm
+ except (AssertionError, ImportError):
+ print("Could not import scipy.linalg.norm or scipy is too old. "
+ "Falling back to numpy.linalg.norm which is not numerically
stable.")
+ from numpy.linalg import norm as sp_norm
+
def l1norm(input_data, axis=0, keepdims=True):
return np.sum(abs(input_data), axis=axis, keepdims=keepdims)
- def l2norm(input_data, axis=0, keepdims=True):
- return np.linalg.norm(input_data, axis=axis, keepdims=keepdims)
+
+ def l2norm(input_data, axis=0, keepdims=True):
+ return sp_norm(input_data, axis=axis, keepdims=keepdims)
ctx = default_context()
data = mx.symbol.Variable('data')
@@ -3051,7 +3061,7 @@ def l2norm(input_data, axis=0, keepdims=True):
for i in range(in_data_dim):
norm_sym = mx.symbol.norm(data=data, ord=order, axis=i,
keepdims=True)
npy_out = l1norm(in_data, i) if order is 1 else
l2norm(in_data, i)
- npy_out_backward = np.sign(in_data) if order is 1 else
in_data/npy_out
+ npy_out_backward = np.sign(in_data) if order is 1 else
in_data/npy_out
check_symbolic_forward(norm_sym, [in_data], [npy_out],
rtol=1e-2 if dtype is np.float16 else
1e-5,
atol=1e-2 if dtype is np.float16 else
1e-5, ctx=ctx)
@@ -3059,22 +3069,23 @@ def l2norm(input_data, axis=0, keepdims=True):
[npy_out_backward],
rtol=1e-2 if dtype is np.float16 else
1e-5,
atol=1e-2 if dtype is np.float16 else
1e-5, ctx=ctx)
- # check gradient
- check_numeric_gradient(norm_sym, [in_data],
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
- if i < in_data_dim-1:
- norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i,
i+1), keepdims=True)
- npy_out = l1norm(in_data, (i, i+1)) if order is 1 else
l2norm(in_data, (i, i+1))
- npy_out_backward = np.sign(in_data) if order is 1 else
in_data/npy_out
- check_symbolic_forward(norm_sym, [in_data], [npy_out],
- rtol=1e-2 if dtype is np.float16
else 1e-5,
- atol=1e-2 if dtype is np.float16
else 1e-5, ctx=ctx)
- check_symbolic_backward(norm_sym, [in_data],
[np.ones(npy_out.shape)],
- [npy_out_backward],
- rtol=1e-2 if dtype is np.float16
else 1e-5,
- atol=1e-2 if dtype is np.float16
else 1e-5, ctx=ctx)
- # check gradient
- check_numeric_gradient(norm_sym, [in_data],
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
-
+ # Disable numeric gradient
https://github.com/apache/incubator-mxnet/issues/11509
+ # # check gradient
+ # check_numeric_gradient(norm_sym, [in_data],
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
+ # if i < in_data_dim-1:
+ # norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i,
i+1), keepdims=True)
+ # npy_out = l1norm(in_data, (i, i+1)) if order is 1 else
l2norm(in_data, (i, i+1))
+ # npy_out_backward = np.sign(in_data) if order is 1 else
in_data/npy_out
+ # check_symbolic_forward(norm_sym, [in_data], [npy_out],
+ # rtol=1e-2 if dtype is np.float16
else 1e-5,
+ # atol=1e-2 if dtype is np.float16
else 1e-5, ctx=ctx)
+ # check_symbolic_backward(norm_sym, [in_data],
[np.ones(npy_out.shape)],
+ # [npy_out_backward],
+ # rtol=1e-2 if dtype is np.float16
else 1e-5,
+ # atol=1e-2 if dtype is np.float16
else 1e-5, ctx=ctx)
+ # # check gradient
+ # check_numeric_gradient(norm_sym, [in_data],
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
+
def test_layer_norm():
for dtype, forward_check_eps in zip([np.float16, np.float32, np.float64],
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services