This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new c24765f  [MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)
c24765f is described below

commit c24765f6088fd66d072dfc3402998680de659217
Author: Haibin Lin <linhaibin.e...@gmail.com>
AuthorDate: Mon Apr 9 10:14:09 2018 -0700

    [MXNET-265] Update optimizer doc to clarify wd behaviors (#10388)
    
    * Update optimizer.py
    
    * FTML
    
    * adamax
    
    * Update optimizer.py
    
    * AdaDelta
    
    * Fix lint
    
    * fix adagrad
    
    * merge wd to grad for adadelta
---
 python/mxnet/optimizer.py               | 40 +++++++++++++--
 tests/python/unittest/test_optimizer.py | 89 +++++++++++++++++++++++++++++++--
 2 files changed, 122 insertions(+), 7 deletions(-)
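
The gist of the change, shown as a minimal standalone NumPy sketch (illustrative only, not MXNet code; the function names below are made up for this example): weight decay is now folded into the rescaled gradient before the adaptive statistics are accumulated, instead of being applied as a separate term at the weight-update step.

import numpy as np

def adagrad_step_old(weight, grad, history, lr, wd, rescale_grad=1.0, eps=1e-7):
    # Old behavior: wd applied outside the adaptive scaling, at the weight update.
    g = grad * rescale_grad
    history += np.square(g)
    div = g / np.sqrt(history + eps)
    weight += (div + weight * wd) * -lr
    return weight, history

def adagrad_step_new(weight, grad, history, lr, wd, rescale_grad=1.0, eps=1e-7):
    # New behavior: wd merged into the gradient before the history is accumulated.
    g = grad * rescale_grad + weight * wd
    history += np.square(g)
    div = g / np.sqrt(history + eps)
    weight += div * -lr
    return weight, history

w0 = np.array([0.5, -1.0, 2.0])
g0 = np.array([0.1, 0.2, -0.3])
w_old, _ = adagrad_step_old(w0.copy(), g0, np.zeros(3), lr=0.1, wd=0.1)
w_new, _ = adagrad_step_new(w0.copy(), g0, np.zeros(3), lr=0.1, wd=0.1)
print(w_old - w_new)  # non-zero once wd > 0: the two placements are not equivalent

With wd folded in, the decay term is scaled by the same adaptive denominator as the gradient, which is what the updated docstrings below now spell out.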

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 6589e77..18bd5c6 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -538,7 +538,7 @@ class SGD(Optimizer):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.
 
-    The optimizer updates the weight by:
+    The optimizer updates the weight by::
 
         rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
         state = momentum * state + (1-momentum)*rescaled_grad
@@ -604,6 +604,14 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.
 
+    Denote time step by t. The optimizer updates the weight by::
+
+        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
+        d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon)
+        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
+        weight = - z / d_t
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1068,6 +1076,13 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
 
+    This optimizer updates each weight by::
+
+            grad = clip(grad * rescale_grad + weight * wd, clip_gradient)
+            history += square(grad)
+            div = grad / sqrt(history + float_stable_eps)
+            weight += div * -lr
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1105,12 +1120,12 @@ class AdaGrad(Optimizer):
                 kwargs['clip_gradient'] = self.clip_gradient
             sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs)
         else:
-            grad = grad * self.rescale_grad
+            grad = grad * self.rescale_grad + weight * wd
             if self.clip_gradient is not None:
                 grad = clip(grad, -self.clip_gradient, self.clip_gradient)
             history[:] += square(grad)
             div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += (div + weight * wd) * -lr
+            weight[:] += div * -lr
 
 @register
 class RMSProp(Optimizer):
@@ -1195,6 +1210,15 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.
 
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
+        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
+        weight -= delta
+
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1222,6 +1246,7 @@ class AdaDelta(Optimizer):
 
         # preprocess grad
         grad *= self.rescale_grad
+        grad += wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
 
@@ -1234,7 +1259,7 @@ class AdaDelta(Optimizer):
         acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
 
         # update weight
-        weight[:] -= current_delta + wd * weight
+        weight[:] -= current_delta
 
 #pylint: disable=invalid-name
 #pylint: disable=line-too-long
@@ -1321,6 +1346,13 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
+    The optimizer updates the weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        m = beta1 * m + (1 - beta1) * grad
+        u = maximum(beta2 * u, abs(grad))
+        weight -= lr / (1 - beta1**t) * m / u
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index bbd7845..07b3067 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -999,12 +999,12 @@ class PyAdaGrad(mx.optimizer.Optimizer):
         wd = self._get_wd(index)
 
         history = state
-        grad = grad * self.rescale_grad
+        grad = grad * self.rescale_grad + weight * wd
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         history[:] += mx.nd.square(grad)
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
-        weight[:] += (div + weight * wd) * -lr
+        weight[:] += div * -lr
 
 def test_adagrad():
     mx.random.seed(0)
@@ -1014,7 +1014,7 @@ def test_adagrad():
     eps_options = [{}, {'eps': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.0}]
+    wd_options = [{}, {'wd': 0.1}]
     for dtype in [np.float32]:
         for eps_option in eps_options:
             for cg_option in cg_options:
@@ -1031,6 +1031,89 @@ def test_adagrad():
                                               w_stype='row_sparse', g_stype='row_sparse')
 
 
+# AdaDelta
+class PyAdaDelta(mx.optimizer.Optimizer):
+    """The python reference of AdaDelta optimizer.
+
+    This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
+    learning rate method*, available at https://arxiv.org/abs/1212.5701.
+
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
+        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
+        weight -= delta
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    rho: float
+        Decay rate for both squared gradients and delta.
+    epsilon : float
+        Small value to avoid division by 0.
+    """
+    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
+        super(PyAdaDelta, self).__init__(**kwargs)
+        self.rho = rho
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context),  # accumulated g
+                mx.nd.zeros(weight.shape, weight.context))  # accumulated delta
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        # preprocess grad
+        grad *= self.rescale_grad
+        grad += wd * weight
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        # accumulated g and delta initialization
+        acc_g, acc_delta = state
+
+        # update g, delta
+        acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad
+        current_delta = mx.nd.sqrt(acc_delta + self.epsilon) / mx.nd.sqrt(acc_g + self.epsilon) * grad
+        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
+
+        # update weight
+        weight[:] -= current_delta
+
+def test_adadelta():
+    mx.random.seed(0)
+    opt1 = PyAdaDelta
+    opt2 = mx.optimizer.AdaDelta
+    shape = (3, 4, 5)
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.1}]
+    for dtype in [np.float32]:
+        for eps_option in eps_options:
+            for cg_option in cg_options:
+                for rg_option in rg_options:
+                    for wd_option in wd_options:
+                        kwarg = {}
+                        kwarg.update(eps_option)
+                        kwarg.update(cg_option)
+                        kwarg.update(rg_option)
+                        kwarg.update(wd_option)
+                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
+
 
 if __name__ == '__main__':
     import nose
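
For reference, the AdaDelta rule exercised by the new test, written as a self-contained NumPy sketch with wd merged into the gradient (a sketch assuming plain NumPy is acceptable; it mirrors PyAdaDelta above but is not MXNet code):

import numpy as np

def adadelta_step(weight, grad, acc_g, acc_delta, wd=0.0, rho=0.90,
                  epsilon=1e-5, rescale_grad=1.0, clip_gradient=None):
    # wd is folded into the gradient before the accumulators are updated
    grad = grad * rescale_grad + wd * weight
    if clip_gradient is not None:
        grad = np.clip(grad, -clip_gradient, clip_gradient)
    acc_g[:] = rho * acc_g + (1. - rho) * grad * grad
    delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_g + epsilon) * grad
    acc_delta[:] = rho * acc_delta + (1. - rho) * delta * delta
    weight[:] -= delta  # no separate wd term at the weight update
    return weight

rng = np.random.RandomState(0)
w = rng.randn(3, 4, 5)
acc_g, acc_delta = np.zeros_like(w), np.zeros_like(w)
for _ in range(5):
    adadelta_step(w, rng.randn(3, 4, 5), acc_g, acc_delta, wd=0.1, clip_gradient=0.4)

The new test above sweeps the same knobs (epsilon, clip_gradient, rescale_grad, wd) across the Python reference and mx.optimizer.AdaDelta to confirm both apply wd this way.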

-- 
To stop receiving notification emails like this one, please contact j...@apache.org.
