szha closed pull request #11573: Add stable nrm2 Reducer
URL: https://github.com/apache/incubator-mxnet/pull/11573
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index a8c650ce8a7..463c0dffe3e 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit a8c650ce8a708608a282c4d1e251c57873a8db25
+Subproject commit 463c0dffe3eae8c39caf7989c85b7244823df27e
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 5953568c7fa..81a55c4a013 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -697,6 +697,22 @@ struct product {
   MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src, 
volatile DType& none) { // NOLINT(*)
     Reduce(dst, src);
   }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
src_val) { // NOLINT(*)
+    Reduce(dst_val, src_val);
+  }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
dst_residual, volatile DType& src_val, volatile DType& src_residual) { // 
NOLINT(*)
+    Reduce(dst_val, src_val);
+  }
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType& 
none) {} // NOLINT(*)
   /*!
   *\brief calculate gradient of redres with respect to redsrc,
   * redres: reduced result, redsrc: one of reduction element
@@ -762,6 +778,26 @@ struct nansum {
     residual = (t - dst) - y;
     dst = t;
   }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
src_val) { // NOLINT(*)
+    Reduce(dst_val, src_val);
+  }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
dst_residual, volatile DType& src_val, volatile DType& src_residual) { // 
NOLINT(*)
+    DType t1 = dst_val + src_val;
+    DType e = t1 - src_val;
+    DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual + 
src_residual;
+    dst_val = t1 + t2;
+    dst_residual = t2 - (dst_val - t1);
+  }
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType& 
residual) {} // NOLINT(*)
   /*!
   *\brief set the initial value during reduction
   */
@@ -799,6 +835,22 @@ struct nanprod {
   MSHADOW_XINLINE static void Reduce(volatile DType& dst, volatile DType src, 
volatile DType& none) { // NOLINT(*)
     Reduce(dst, src);
   }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
src_val) { // NOLINT(*)
+    Reduce(dst_val, src_val);
+  }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
dst_residual, volatile DType& src_val, volatile DType& src_residual) { // 
NOLINT(*)
+    Reduce(dst_val, src_val);
+  }
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*)
+  /*! \brief finalize reduction */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType& 
none) {} // NOLINT(*)
   /*!
   *\brief set the initial value during reduction
   */
@@ -806,6 +858,7 @@ struct nanprod {
   MSHADOW_XINLINE static void SetInitValue(DType & initv) { // NOLINT(*)
     initv = 1;
   }
+
   /*!
   *\brief set the initial value during reduction
   */
@@ -815,6 +868,76 @@ struct nanprod {
   }
 };
 
+/*! \brief compute l2 norm */
+struct nrm2 {
+  /*! \brief do reduction into dst */
+  template<typename DType>
+  MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares, volatile 
DType src) { // NOLINT(*)
+    sum_of_squares += src * src;
+  }
+  /*! \brief do stable reduction into dst */
+  template<typename DType>
+  MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares,  volatile 
DType src, volatile DType& scale) { // NOLINT(*)
+    if (src != 0) {
+      DType abs = mshadow_op::abs::Map(src);
+      if (scale < abs) {
+        sum_of_squares = 1 + sum_of_squares * (scale / abs) * (scale / abs);
+        scale = abs;
+      } else {
+        sum_of_squares = sum_of_squares + (abs / scale) * (abs / scale);
+      }
+    }
+  }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& 
src_val) { // NOLINT(*)
+    dst_val += src_val;
+  }
+  /*! \brief combine the results of two reducers */
+  template<typename DType>
+  MSHADOW_XINLINE static void Merge(volatile DType& dst_ssq, volatile DType& 
dst_scale, volatile DType& src_ssq, volatile DType& src_scale) { // NOLINT(*)
+    if (dst_scale != 0 && dst_scale >= src_scale) {
+      dst_ssq = dst_ssq + src_ssq * (src_scale / dst_scale) * (src_scale / 
dst_scale);
+    } else if (src_scale != 0 && dst_scale < src_scale) {
+      dst_ssq = src_ssq + dst_ssq * (dst_scale / src_scale) * (dst_scale / 
src_scale);
+      dst_scale = src_scale;
+    }
+  }
+  /*! \brief finalize reduction result */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& sum_of_squares) { // 
NOLINT(*)
+    sum_of_squares = math::sqrt(sum_of_squares);
+  }
+  /*! \brief finalize reduction result */
+  template<typename DType>
+  MSHADOW_XINLINE static void Finalize(volatile DType& sum_of_squares, 
volatile DType& scale) { // NOLINT(*)
+    sum_of_squares = scale * math::sqrt(sum_of_squares);
+  }
+  /*!
+   *\brief calculate gradient of redres with respect to redsrc,
+   * redres: reduced result, redsrc: one of reduction element
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) {
+    return redsrc / redres;
+  }
+  /*!
+   *\brief set the initial value during reduction
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void SetInitValue(DType &sum_of_squares) { // 
NOLINT(*)
+    sum_of_squares = 0;
+  }
+  /*!
+   *\brief set the initial value during reduction
+   */
+  template<typename DType>
+  MSHADOW_XINLINE static void SetInitValue(DType &sum_of_squares, DType 
&scale) { // NOLINT(*)
+    SetInitValue(sum_of_squares);
+    scale = 0;
+  }
+};
+
 struct nanprod_grad : public mxnet_op::tunable {
   template<typename DType>
   MSHADOW_XINLINE static DType Map(DType a, DType b) {
diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh 
b/src/operator/tensor/broadcast_reduce-inl.cuh
index b6bb39a1984..5c9b45f547f 100644
--- a/src/operator/tensor/broadcast_reduce-inl.cuh
+++ b/src/operator/tensor/broadcast_reduce-inl.cuh
@@ -123,27 +123,32 @@ __global__ void reduce_kernel(const int N, const int M, 
const bool addto,
         // Fix bx to avoid bank conflicts. Assumes warpSize number of banks
         const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx + 
1) : bx;
         const int it0 = tidx + tidy*fbx;
-        shTile[it0] = val;
+        shTile[it0 * 2] = val;
+        shTile[it0 * 2 + 1] = residual;
         __syncthreads();
         for (int t=1;t < by;t <<= 1) {
-          DType tmp, residual;
-          Reducer::SetInitValue(tmp, residual);
-          if (tidy + t < by) tmp = shTile[it0 + t*fbx];
+          DType tmp, tmp_residual;
+          Reducer::SetInitValue(tmp, tmp_residual);
+          if (tidy + t < by) {
+            tmp = shTile[(it0 + t*fbx) * 2];
+            tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+          }
           __syncthreads();
-          Reducer::Reduce(shTile[it0], tmp, residual);
+          Reducer::Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, 
tmp_residual);
           __syncthreads();
         }
         if (idx < N && tidy == 0) {
-          assign(&small[idx + m0*N], addto, shTile[tidx]);
+          Reducer::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+          assign(&small[idx + m0*N], addto, shTile[tidx * 2]);
         }
       } else {
         if (idx < N) {
+          Reducer::Finalize(val, residual);
           assign(&small[idx + m0*N], addto, val);
         }
       }
     }
   }
-
 }
 
 template<typename Reducer, int ndim, typename DType, typename OP1, typename 
OP2, int unroll>
@@ -207,27 +212,32 @@ __global__ void reduce_kernel(const int N, const int M, 
const bool addto,
         // Fix bx to avoid bank conflicts. Assumes warpSize number of banks
         const int fbx = (do_transpose && ((bx & (warpSize - 1)) == 0)) ? (bx + 
1) : bx;
         const int it0 = tidx + tidy*fbx;
-        shTile[it0] = val;
+        shTile[it0 * 2] = val;
+        shTile[it0 * 2 + 1] = residual;
         __syncthreads();
         for (int t=1;t < by;t <<= 1) {
-          DType tmp, residual;
-          Reducer::SetInitValue(tmp, residual);
-          if (tidy + t < by) tmp = shTile[it0 + t*fbx];
+          DType tmp, tmp_residual;
+          Reducer::SetInitValue(tmp, tmp_residual);
+          if (tidy + t < by) {
+            tmp = shTile[(it0 + t*fbx) * 2];
+            tmp_residual = shTile[(it0 + t*fbx) * 2 + 1];
+          }
           __syncthreads();
-          Reducer::Reduce(shTile[it0], tmp, residual);
+          Reducer::Merge(shTile[it0 * 2], shTile[it0 * 2 + 1], tmp, 
tmp_residual);
           __syncthreads();
         }
         if (idx < N && tidy == 0) {
-          assign(&small[idx + m0*N], addto, shTile[tidx]);
+          Reducer::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]);
+          assign(&small[idx + m0*N], addto, shTile[tidx * 2]);
         }
       } else {
         if (idx < N) {
+          Reducer::Finalize(val, residual);
           assign(&small[idx + m0*N], addto, val);
         }
       }
     }
   }
-
 }
 
 // Simple reduction of lines when M is small
@@ -244,6 +254,7 @@ __global__ void reduce_lines_kernel(const int N, const int 
M, const bool addto,
     }
 
     if (idx < N) {
+      Reducer::Finalize(val, residual);
       assign(&small_out[idx], addto, val);
     }
 
@@ -453,7 +464,7 @@ ReduceImplConfig<ndim> ConfigureReduceImpl(const TShape& 
small, const TShape& bi
         by++;
       }
       config.kernel_1.shMemSize = (config.kernel_1.blockDim.x > 1) ?
-        config.kernel_1.blockDim.x*by*sizeof(DType) : 0;
+        config.kernel_1.blockDim.x*by*sizeof(DType) * 2 : 0;
       // Maximum number of times we want TB to loop in M
       // Max size of M-block each TB can handle
       int maxMblock = config.kernel_1.blockDim.x*config.maxLoopPerTB;
@@ -464,7 +475,7 @@ ReduceImplConfig<ndim> ConfigureReduceImpl(const TShape& 
small, const TShape& bi
         ceil_idiv<unsigned int>(config.N, config.kernel_1.blockDim.x));
       config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext);
       config.kernel_1.shMemSize = (config.kernel_1.blockDim.y > 1) ?
-        config.kernel_1.blockDim.x*config.kernel_1.blockDim.y*sizeof(DType) : 
0;
+        config.kernel_1.blockDim.x*config.kernel_1.blockDim.y*sizeof(DType) * 
2 : 0;
       // Maximum number of times we want TB to loop in M
       // Max size of M-block each TB can handle
       int maxMblock = config.kernel_1.blockDim.y*config.maxLoopPerTB;
diff --git a/src/operator/tensor/broadcast_reduce-inl.h 
b/src/operator/tensor/broadcast_reduce-inl.h
index 76ec92a9e72..713e3f1ac60 100644
--- a/src/operator/tensor/broadcast_reduce-inl.h
+++ b/src/operator/tensor/broadcast_reduce-inl.h
@@ -165,6 +165,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const 
int M, const bool ad
     coord = unravel(k, rshape);
     Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual);
   }
+  Reducer::Finalize(val, residual);
   assign(&small[idx], addto, val);
 }
 
@@ -256,6 +257,7 @@ MSHADOW_XINLINE void seq_reduce_assign(const int idx, const 
int M, const bool ad
 
     Reducer::Reduce(val, OP1::Map(big[idx_big], OP2::Map(lhs[idx_lhs], 
rhs[idx_rhs])), residual);
   }
+  Reducer::Finalize(val, residual);
   assign(&small[idx], addto, val);
 }
 
diff --git a/src/operator/tensor/broadcast_reduce_op.h 
b/src/operator/tensor/broadcast_reduce_op.h
index ac7199a9482..d9a749e0db8 100644
--- a/src/operator/tensor/broadcast_reduce_op.h
+++ b/src/operator/tensor/broadcast_reduce_op.h
@@ -1005,9 +1005,8 @@ void LpNormCompute(const nnvm::NodeAttrs& attrs,
     ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, mshadow_op::abs>(
           ctx, inputs, req, outputs, small);
   } else if (param.ord == 2) {
-    ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, mshadow_op::square>(
+    ReduceAxesComputeImpl<xpu, mshadow_op::nrm2, false, mshadow_op::identity>(
         ctx, inputs, req, outputs, small);
-    SqRootForL2<xpu>(ctx, req[0], outputs[0]);
   }
 }
 
diff --git a/tests/python/unittest/test_ndarray.py 
b/tests/python/unittest/test_ndarray.py
index cf5906ae454..b57e71d73b2 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -17,6 +17,7 @@
 
 import mxnet as mx
 import numpy as np
+from distutils.version import LooseVersion
 import os
 import pickle as pkl
 import unittest
@@ -1276,10 +1277,19 @@ def test_ndarray_astype():
 
 @with_seed()
 def test_norm(ctx=default_context()):
+    try:
+        import scipy
+        assert LooseVersion(scipy.__version__) >= LooseVersion('0.1')
+        from scipy.linalg import norm as sp_norm
+    except (AssertionError, ImportError):
+        print("Could not import scipy.linalg.norm or scipy is too old. "
+              "Falling back to numpy.linalg.norm which is not numerically 
stable.")
+        from numpy.linalg import norm as sp_norm
+
     def l1norm(input_data, axis=0, keepdims=False):
         return np.sum(abs(input_data), axis=axis, keepdims=keepdims)
     def l2norm(input_data, axis=0, keepdims=False): 
-        return np.linalg.norm(input_data, axis=axis, keepdims=keepdims)
+        return sp_norm(input_data, axis=axis, keepdims=keepdims)
 
     in_data_dim = random_sample([4,5,6], 1)[0]
     in_data_shape = rand_shape_nd(in_data_dim)
diff --git a/tests/python/unittest/test_operator.py 
b/tests/python/unittest/test_operator.py
index ae5cba21711..5a2067eab4a 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -23,6 +23,7 @@
 import math
 import random
 import itertools
+from distutils.version import LooseVersion
 from numpy.testing import assert_allclose, assert_array_equal
 from mxnet.test_utils import *
 from mxnet.base import py_str, MXNetError, _as_list
@@ -3031,13 +3032,22 @@ def npy_layer_norm(data, gamma, beta, axis=1, eps=1E-5):
                                grad_nodes={'data': req, 'gamma': req, 'beta': 
req},
                                numeric_eps=1e-2, rtol=1e-2, atol=1e-2)
 
-@unittest.skip("Flaky test: 
https://github.com/apache/incubator-mxnet/issues/11509";)
 @with_seed()
 def test_norm():
+    try:
+        import scipy
+        assert LooseVersion(scipy.__version__) >= LooseVersion('0.1')
+        from scipy.linalg import norm as sp_norm
+    except (AssertionError, ImportError):
+        print("Could not import scipy.linalg.norm or scipy is too old. "
+              "Falling back to numpy.linalg.norm which is not numerically 
stable.")
+        from numpy.linalg import norm as sp_norm
+
     def l1norm(input_data, axis=0, keepdims=True):
         return np.sum(abs(input_data), axis=axis, keepdims=keepdims)
-    def l2norm(input_data, axis=0, keepdims=True): 
-        return np.linalg.norm(input_data, axis=axis, keepdims=keepdims)
+
+    def l2norm(input_data, axis=0, keepdims=True):
+        return sp_norm(input_data, axis=axis, keepdims=keepdims)
 
     ctx = default_context()
     data = mx.symbol.Variable('data')
@@ -3051,7 +3061,7 @@ def l2norm(input_data, axis=0, keepdims=True):
             for i in range(in_data_dim):
                 norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, 
keepdims=True)
                 npy_out = l1norm(in_data, i) if order is 1 else 
l2norm(in_data, i)
-                npy_out_backward = np.sign(in_data) if order is 1 else 
in_data/npy_out 
+                npy_out_backward = np.sign(in_data) if order is 1 else 
in_data/npy_out
                 check_symbolic_forward(norm_sym, [in_data], [npy_out],
                                         rtol=1e-2 if dtype is np.float16 else 
1e-5,
                                         atol=1e-2 if dtype is np.float16 else 
1e-5, ctx=ctx)
@@ -3059,22 +3069,23 @@ def l2norm(input_data, axis=0, keepdims=True):
                                         [npy_out_backward],
                                         rtol=1e-2 if dtype is np.float16 else 
1e-5,
                                         atol=1e-2 if dtype is np.float16 else 
1e-5, ctx=ctx)
-                # check gradient
-                check_numeric_gradient(norm_sym, [in_data], 
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
-                if i < in_data_dim-1:
-                    norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, 
i+1), keepdims=True)
-                    npy_out = l1norm(in_data, (i, i+1)) if order is 1 else 
l2norm(in_data, (i, i+1))
-                    npy_out_backward = np.sign(in_data) if order is 1 else 
in_data/npy_out 
-                    check_symbolic_forward(norm_sym, [in_data], [npy_out],
-                                           rtol=1e-2 if dtype is np.float16 
else 1e-5,
-                                           atol=1e-2 if dtype is np.float16 
else 1e-5, ctx=ctx)
-                    check_symbolic_backward(norm_sym, [in_data], 
[np.ones(npy_out.shape)],
-                                            [npy_out_backward],
-                                            rtol=1e-2 if dtype is np.float16 
else 1e-5,
-                                            atol=1e-2 if dtype is np.float16 
else 1e-5, ctx=ctx)
-                    # check gradient
-                    check_numeric_gradient(norm_sym, [in_data], 
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
-                        
+                # Disable numeric gradient 
https://github.com/apache/incubator-mxnet/issues/11509
+                # # check gradient
+                # check_numeric_gradient(norm_sym, [in_data], 
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
+                # if i < in_data_dim-1:
+                #     norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, 
i+1), keepdims=True)
+                #     npy_out = l1norm(in_data, (i, i+1)) if order is 1 else 
l2norm(in_data, (i, i+1))
+                #     npy_out_backward = np.sign(in_data) if order is 1 else 
in_data/npy_out
+                #     check_symbolic_forward(norm_sym, [in_data], [npy_out],
+                #                            rtol=1e-2 if dtype is np.float16 
else 1e-5,
+                #                            atol=1e-2 if dtype is np.float16 
else 1e-5, ctx=ctx)
+                #     check_symbolic_backward(norm_sym, [in_data], 
[np.ones(npy_out.shape)],
+                #                             [npy_out_backward],
+                #                             rtol=1e-2 if dtype is np.float16 
else 1e-5,
+                #                             atol=1e-2 if dtype is np.float16 
else 1e-5, ctx=ctx)
+                #     # check gradient
+                #     check_numeric_gradient(norm_sym, [in_data], 
numeric_eps=epsilon, rtol=1e-2, atol=1e-3)
+
 
 def test_layer_norm():
     for dtype, forward_check_eps in zip([np.float16, np.float32, np.float64],


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to