sandeep-krishnamurthy closed pull request #13443: Adadelta optimizer test
URL: https://github.com/apache/incubator-mxnet/pull/13443
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:


diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index 442a11d0220..d7b6821ac8c 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -637,6 +637,8 @@ class FTML(Optimizer):
         z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
         weight = - z / d_t
 
+    For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`.
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 334b7d4c0fd..36f7897acaf 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import itertools
 import numpy as np
 import mxnet as mx
 import mxnet.lr_scheduler as lr_scheduler
@@ -976,8 +977,8 @@ def update(self, index, weight, grad, state):
         div = grad / mx.nd.sqrt(history + self.float_stable_eps)
         weight[:] += (div + weight * wd) * -lr
 
+@with_seed()
 def test_adagrad():
-    mx.random.seed(0)
     opt1 = PyAdaGrad
     opt2 = mx.optimizer.AdaGrad
     shape = (3, 4, 5)
@@ -1002,6 +1003,76 @@ def test_adagrad():
                             compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
                                               g_stype='row_sparse')
 
+# AdaDelta
+class PyAdaDelta(mx.optimizer.Optimizer):
+    """The python reference of AdaDelta optimizer.
+
+    This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
+    learning rate method*, available at https://arxiv.org/abs/1212.5701.
+
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
+        cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
+        weight -= (cur_delta + wd * weight)
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    rho: float
+        Decay rate for both squared gradients and delta.
+    epsilon : float
+        Small value to avoid division by 0.
+    """
+    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
+        super(PyAdaDelta, self).__init__(**kwargs)
+        self.rho = rho
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context),
+                mx.nd.zeros(weight.shape, weight.context))
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        wd = self._get_wd(index)
+
+        grad *= self.rescale_grad
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        acc_grad, acc_delta = state
+
+        acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2
+        current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) /
+                         mx.nd.sqrt(acc_grad + self.epsilon)) * grad
+        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta ** 2
+
+        # update weight
+        weight[:] -= current_delta + wd * weight
+
+@with_seed()
+def test_adadelta():
+    opt1 = PyAdaDelta
+    opt2 = mx.optimizer.AdaDelta
+    shape = (3, 4, 5)
+    rho_options = [{'rho': 0.9}]
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.0}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(rho_options, eps_options, cg_options,
+                                        rg_options, wd_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
 
 def test_factor_scheduler():
     base_lr = 1


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services
