piiswrong closed pull request #10388: [MXNET-265] Update optimizer doc to clarify wd behaviors
URL: https://github.com/apache/incubator-mxnet/pull/10388

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is displayed below for the sake of provenance:

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 6589e77e453..18bd5c6e4fe 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -538,7 +538,7 @@ def update_multi_precision(self, index, weight, grad, state):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.
 
-    The optimizer updates the weight by:
+    The optimizer updates the weight by::
 
         rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
         state = momentum * state + (1-momentum)*rescaled_grad
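
For reference, the Signum rule above translates directly to NumPy. The sketch
below is illustrative only and not part of the PR: the function name and the
hyper-parameter defaults are placeholders, and the final sign-based weight step
follows the class description rather than this hunk. Note that for Signum the
clipping applies to the raw gradient, with wd added afterwards, unlike the
optimizers further down in this diff:

    # Illustrative sketch of the documented Signum update (not part of the PR).
    import numpy as np

    def signum_step(weight, grad, state, lr=0.01, momentum=0.9,
                    rescale_grad=1.0, clip_gradient=1.0, wd=0.0):
        # wd is folded into the rescaled gradient before the momentum update
        rescaled_grad = (rescale_grad * np.clip(grad, -clip_gradient, clip_gradient)
                         + wd * weight)
        state = momentum * state + (1.0 - momentum) * rescaled_grad
        # descend along the sign of the momentum state
        weight = weight - lr * np.sign(state)
        return weight, state
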
@@ -604,6 +604,14 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.
 
+    Denote time step by t. The optimizer updates the weight by::
+
+        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
+        d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon)
+        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
+        weight = - z / d_t
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
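Likewise, the FTML rule above as a NumPy sketch (illustrative only and not part
of the PR; t is the 1-based update count, and the defaults shown are
placeholders rather than MXNet's):

    # Illustrative sketch of the documented FTML update (not part of the PR).
    import numpy as np

    def ftml_step(weight, grad, v, z, d_prev, t, lr=0.0025,
                  beta1=0.6, beta2=0.999, epsilon=1e-8,
                  rescale_grad=1.0, clip_gradient=1.0, wd=0.0):
        # wd enters through the rescaled gradient, before clipping
        rescaled_grad = np.clip(grad * rescale_grad + wd * weight,
                                -clip_gradient, clip_gradient)
        v = beta2 * v + (1.0 - beta2) * np.square(rescaled_grad)
        d_t = (1.0 - beta1 ** t) / lr * (np.sqrt(v / (1.0 - beta2 ** t)) + epsilon)
        z = beta1 * z + (1.0 - beta1) * rescaled_grad - (d_t - beta1 * d_prev) * weight
        weight = -z / d_t
        return weight, v, z, d_t
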
@@ -1068,6 +1076,13 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
 
+    This optimizer updates each weight by::
+
+            grad = clip(grad * rescale_grad + weight * wd, clip_gradient)
+            history += square(grad)
+            div = grad / sqrt(history + float_stable_eps)
+            weight += div * -lr
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1105,12 +1120,12 @@ def update(self, index, weight, grad, state):
                 kwargs['clip_gradient'] = self.clip_gradient
             sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs)
         else:
-            grad = grad * self.rescale_grad
+            grad = grad * self.rescale_grad + weight * wd
             if self.clip_gradient is not None:
                 grad = clip(grad, -self.clip_gradient, self.clip_gradient)
             history[:] += square(grad)
             div = grad / sqrt(history + self.float_stable_eps)
-            weight[:] += (div + weight * wd) * -lr
+            weight[:] += div * -lr
 
 @register
 class RMSProp(Optimizer):
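
This hunk is the behavioral core of the PR for dense AdaGrad: wd is now folded
into the gradient before clipping and before the squared gradient enters the
history, instead of being added to the update at the very end. A minimal NumPy
sketch of the two variants (illustrative only, not part of the PR; gradient
clipping is omitted for brevity and the defaults are placeholders):

    import numpy as np

    def adagrad_step_new(weight, grad, history, lr=0.1, wd=0.1,
                         rescale_grad=1.0, eps=1e-7):
        grad = grad * rescale_grad + weight * wd   # wd folded into the gradient
        history += np.square(grad)                 # wd term also feeds the history
        return weight - lr * grad / np.sqrt(history + eps), history

    def adagrad_step_old(weight, grad, history, lr=0.1, wd=0.1,
                         rescale_grad=1.0, eps=1e-7):
        grad = grad * rescale_grad
        history += np.square(grad)
        div = grad / np.sqrt(history + eps)
        return weight - lr * (div + weight * wd), history  # wd applied at the end

The two variants agree whenever wd == 0, which is why the test change below
switches its wd_options from 0.0 to 0.1.
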
@@ -1195,6 +1210,15 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.
 
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
+        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
+        weight -= delta
+
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
@@ -1222,6 +1246,7 @@ def update(self, index, weight, grad, state):
 
         # preprocess grad
         grad *= self.rescale_grad
+        grad += wd * weight
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
 
@@ -1234,7 +1259,7 @@ def update(self, index, weight, grad, state):
         acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
 
         # update weight
-        weight[:] -= current_delta + wd * weight
+        weight[:] -= current_delta
 
 #pylint: disable=invalid-name
 #pylint: disable=line-too-long
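
For reference, the documented AdaDelta rule as a NumPy sketch (illustrative
only, not part of the PR; rho=0.90 and epsilon=1e-5 follow the reference
implementation added to the tests below, the other defaults are placeholders):

    import numpy as np

    def adadelta_step(weight, grad, acc_grad, acc_delta, rho=0.90, epsilon=1e-5,
                      rescale_grad=1.0, clip_gradient=1.0, wd=0.0):
        # wd is folded into the gradient before clipping, rather than being
        # subtracted from the weight at the end as the previous code did
        grad = np.clip(grad * rescale_grad + wd * weight,
                       -clip_gradient, clip_gradient)
        acc_grad = rho * acc_grad + (1.0 - rho) * grad * grad
        delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_grad + epsilon) * grad
        acc_delta = rho * acc_delta + (1.0 - rho) * delta * delta
        return weight - delta, acc_grad, acc_delta
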
@@ -1321,6 +1346,13 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
+    The optimizer updates the weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        m = beta1 * m_t + (1 - beta1) * grad
+        u = maximum(beta2 * u, abs(grad))
+        weight -= lr / (1 - beta1**t) * m / u
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
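And the documented Adamax rule as a NumPy sketch (illustrative only, not part
of the PR; t is the 1-based update count used in the bias correction, and the
defaults are placeholders):

    import numpy as np

    def adamax_step(weight, grad, m, u, t, lr=0.002, beta1=0.9, beta2=0.999,
                    rescale_grad=1.0, clip_gradient=1.0, wd=0.0):
        # wd enters through the gradient, before clipping
        grad = np.clip(grad * rescale_grad + wd * weight,
                       -clip_gradient, clip_gradient)
        m = beta1 * m + (1.0 - beta1) * grad
        u = np.maximum(beta2 * u, np.abs(grad))
        weight = weight - lr / (1.0 - beta1 ** t) * m / u
        return weight, m, u
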
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index bbd7845f66f..07b306702f3 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -999,12 +999,12 @@ def update(self, index, weight, grad, state):
         wd = self._get_wd(index)
 
         history = state
-        grad = grad * self.rescale_grad
+        grad = grad * self.rescale_grad + weight * wd
         if self.clip_gradient is not None:
             grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
         history[:] += mx.nd.square(grad)
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
-        weight[:] += (div + weight * wd) * -lr
+        weight[:] += div * -lr
 
 def test_adagrad():
     mx.random.seed(0)
@@ -1014,7 +1014,7 @@ def test_adagrad():
     eps_options = [{}, {'eps': 1e-8}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.0}]
+    wd_options = [{}, {'wd': 0.1}]
     for dtype in [np.float32]:
         for eps_option in eps_options:
             for cg_option in cg_options:
@@ -1031,6 +1031,89 @@ def test_adagrad():
                                           w_stype='row_sparse', g_stype='row_sparse')
 
 
+# AdaDelta
+class PyAdaDelta(mx.optimizer.Optimizer):
+    """The python reference of AdaDelta optimizer.
+
+    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
+    learning rate method*, available at https://arxiv.org/abs/1212.5701.
+
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
+        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
+        weight -= delta
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    rho: float
+        Decay rate for both squared gradients and delta.
+    epsilon : float
+        Small value to avoid division by 0.
+    """
+    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
+        super(PyAdaDelta, self).__init__(**kwargs)
+        self.rho = rho
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context),  # accumulated g
+                mx.nd.zeros(weight.shape, weight.context))  # accumulated delta
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        # preprocess grad
+        grad *= self.rescale_grad
+        grad += wd * weight
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        # accumulated g and delta initialization
+        acc_g, acc_delta = state
+
+        # update g, delta
+        acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad
+        current_delta = mx.nd.sqrt(acc_delta + self.epsilon) / mx.nd.sqrt(acc_g + self.epsilon) * grad
+        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
+
+        # update weight
+        weight[:] -= current_delta
+
+def test_adadelta():
+    mx.random.seed(0)
+    opt1 = PyAdaDelta
+    opt2 = mx.optimizer.AdaDelta
+    shape = (3, 4, 5)
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.1}]
+    for dtype in [np.float32]:
+        for eps_option in eps_options:
+            for cg_option in cg_options:
+                for rg_option in rg_options:
+                    for wd_option in wd_options:
+                        kwarg = {}
+                        kwarg.update(eps_option)
+                        kwarg.update(cg_option)
+                        kwarg.update(rg_option)
+                        kwarg.update(wd_option)
+                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
+
 
 if __name__ == '__main__':
     import nose


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services
