This is an automated email from the ASF dual-hosted git repository.
the pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new be47870 [MXNET-563] Refactor R optimizers to fix memory leak (#11374)
be47870 is described below
commit be478700e01944ecfee0c30a7cc5dc07d1b2789a
Author: jeremiedb <[email protected]>
AuthorDate: Wed Jul 25 17:10:16 2018 -0400
[MXNET-563] Refactor R optimizers to fix memory leak (#11374)
* refactor R optimizers to fix memory leak
* add Adadelta and Adagrad
* fix comments
* fix comments
* fix comments
* add tests
* fix whitespaces
* fix whitespaces
* fix typo
* fix typo
* add doc on clipping
---
R-package/R/model.R | 2 +-
R-package/R/model.rnn.R | 170 ++++++-----
R-package/R/optimizer.R | 450 +++++++++++++++++-------------
R-package/tests/testthat/test_optimizer.R | 204 ++++++++++++++
4 files changed, 545 insertions(+), 281 deletions(-)
diff --git a/R-package/R/model.R b/R-package/R/model.R
index b461f79..a2c4419 100644
--- a/R-package/R/model.R
+++ b/R-package/R/model.R
@@ -147,7 +147,7 @@ mx.model.train <- function(symbol, ctx, input.shape,
output.shape,
kvstore$set.optimizer(optimizer)
} else {
updaters <- lapply(seq_len(ndevice), function(i) {
- mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
+ mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx =
ctx[[i]])
})
}
if (!is.null(kvstore)) {
diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R
index f328d1b..580c82a 100644
--- a/R-package/R/model.rnn.R
+++ b/R-package/R/model.rnn.R
@@ -1,51 +1,50 @@
# Internal function to do multiple device training on RNN
-mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
- dlist, arg.params, aux.params,
- grad.req, arg.update.idx,
+mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
+ dlist, arg.params, aux.params,
+ grad.req, arg.update.idx,
begin.round, end.round, optimizer, metric,
metric_cpu,
- epoch.end.callback, batch.end.callback,
kvstore, verbose,
- gc_freq) {
-
+ epoch.end.callback, batch.end.callback,
kvstore, verbose) {
+
ndevice <- length(ctx)
- if (verbose)
+ if (verbose)
message("Start training with ", ndevice, " devices")
-
+
input.names <- names(dlist)
arg.params.names <- names(arg.params)
-
+
if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else
sym_ini <- symbol
-
+
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs
= ndevice, axis = 0, squeeze_axis = FALSE))
})
-
+
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s,
arg.params)[arg.update.idx],
+ mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s,
arg.params)[arg.update.idx],
aux.arrays = aux.params, ctx = ctx[[i]], grad.req =
grad.req)
})
-
+
# KVStore related stuffs
params.index <- as.integer(
mx.util.filter.null(
lapply(seq_along(train.execs[[1]]$ref.grad.arrays), function(k) {
if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL}
)))
-
+
update.on.kvstore <- FALSE
if (!is.null(kvstore) && kvstore$update.on.kvstore) {
update.on.kvstore <- TRUE
kvstore$set.optimizer(optimizer)
} else {
updaters <- lapply(seq_len(ndevice), function(i) {
- mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
+ mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx =
ctx[[i]])
})
}
-
+
if (!is.null(kvstore)) {
kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index])
}
-
+
# train over specified number of epochs
for (iteration in begin.round:end.round) {
nbatch <- 0
@@ -55,20 +54,20 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
}
train.data$reset()
while (train.data$iter.next()) {
-
+
# Get iterator data
dlist <- train.data$value()[input.names]
-
+
# Slice inputs for multi-devices
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]],
num_outputs = ndevice, axis = 0, squeeze_axis = F))
})
-
+
# Assign input to each executor - bug on inference if using BatchNorm
if (is.list(symbol)) {
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]],
+ mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]],
arg.arrays = c(s,
train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
aux.arrays = train.execs[[i]]$aux.arrays, ctx =
ctx[[i]], grad.req = grad.req)
})
@@ -78,12 +77,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
}
}
-
+
# forward pass
for (texec in train.execs) {
mx.exec.forward(texec, is.train = TRUE)
}
-
+
# copy of preds and labels for metric
if (!is.null(metric)) {
preds <- lapply(train.execs, function(texec) {texec$ref.outputs[[1]]})
@@ -93,12 +92,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
labels <- lapply(seq_along(train.execs), function(i)
{mx.nd.copyto(labels[[i]], mx.cpu())})
}
}
-
+
# backward pass
for (texec in train.execs) {
mx.exec.backward(texec)
}
-
+
if (!is.null(kvstore)) {
# push the gradient
kvstore$push(params.index, lapply(train.execs, function(texec) {
@@ -124,7 +123,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]],
skip.null = TRUE)
}
}
-
+
# Update the evaluation metrics
if (!is.null(metric)) {
for (i in seq_len(ndevice)) {
@@ -133,43 +132,40 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
state = train.metric)
}
}
-
+
nbatch <- nbatch + 1
- if (!is.null(gc_freq)) {
- if (nbatch %% gc_freq == 0) gc()
- }
-
+
if (!is.null(batch.end.callback)) {
batch.end.callback(iteration, nbatch, environment())
}
}
-
+
if (!is.null(metric)) {
result <- metric$get(train.metric)
- if (verbose)
+ if (verbose)
message("[", iteration, "] Train-", result$name, "=", result$value)
}
-
+
if (!is.null(eval.data)) {
if (!is.null(metric)) {
eval.metric <- metric$init()
}
eval.data$reset()
while (eval.data$iter.next()) {
-
+
# Get iterator data
dlist <- eval.data$value()[input.names]
-
+
# Slice input to multiple devices
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]],
num_outputs = ndevice, axis = 0, squeeze_axis = FALSE))
})
-
+
# Assign input to each executor - bug on inference if using BatchNorm
if (is.list(symbol)) {
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]],
+ mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]],
arg.arrays = c(s,
train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
aux.arrays = train.execs[[i]]$aux.arrays, ctx =
ctx[[i]], grad.req = grad.req)
})
@@ -179,12 +175,12 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
}
}
-
+
# forward pass
for (texec in train.execs) {
mx.exec.forward(texec, is.train = FALSE)
}
-
+
# copy of preds and labels for metric and update metric
if (!is.null(metric)) {
preds <- lapply(train.execs, function(texec)
{texec$ref.outputs[[1]]})
@@ -194,17 +190,17 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
labels <- lapply(seq_along(train.execs), function(i)
{mx.nd.copyto(labels[[i]], mx.cpu())})
}
for (i in seq_len(ndevice)) {
- eval.metric <- metric$update(label = labels[[i]],
- pred = preds[[i]],
+ eval.metric <- metric$update(label = labels[[i]],
+ pred = preds[[i]],
state = eval.metric)
}
}
}
-
+
if (!is.null(metric)) {
result <- metric$get(eval.metric)
if (verbose) {
- message("[", iteration, "] Validation-", result$name, "=",
+ message("[", iteration, "] Validation-", result$name, "=",
result$value)
}
}
@@ -213,12 +209,12 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
}
# get the model out
model <- mx.model.extract.model(sym_ini, train.execs)
-
+
epoch_continue <- TRUE
if (!is.null(epoch.end.callback)) {
epoch_continue <- epoch.end.callback(iteration, 0, environment(),
verbose = verbose)
}
-
+
if (!epoch_continue) {
break
}
@@ -227,7 +223,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
}
-#
+#
#' Train RNN with bucket support
#'
#' @param symbol Symbol or list of Symbols representing the model
@@ -245,33 +241,33 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
#' @param verbose
#'
#' @export
-mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric =
NULL,
- arg.params = NULL, aux.params = NULL,
fixed.params = NULL,
- num.round = 1, begin.round = 1,
- initializer = mx.init.uniform(0.01), optimizer =
"sgd", ctx = NULL,
- batch.end.callback = NULL, epoch.end.callback =
NULL,
- kvstore = "local", verbose = TRUE, metric_cpu =
TRUE, gc_freq = NULL) {
-
+mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric =
NULL,
+ arg.params = NULL, aux.params = NULL,
fixed.params = NULL,
+ num.round = 1, begin.round = 1,
+ initializer = mx.init.uniform(0.01), optimizer =
"sgd", ctx = NULL,
+ batch.end.callback = NULL, epoch.end.callback =
NULL,
+ kvstore = "local", verbose = TRUE, metric_cpu =
TRUE) {
+
if (!train.data$iter.next()) {
train.data$reset()
- if (!train.data$iter.next())
+ if (!train.data$iter.next())
stop("Empty train.data")
}
-
+
if (!is.null(eval.data)) {
if (!eval.data$iter.next()) {
eval.data$reset()
- if (!eval.data$iter.next())
+ if (!eval.data$iter.next())
stop("Empty eval.data")
}
}
-
- if (is.null(ctx))
+
+ if (is.null(ctx))
ctx <- mx.ctx.default()
if (is.mx.context(ctx)) {
ctx <- list(ctx)
}
- if (!is.list(ctx))
+ if (!is.list(ctx))
stop("ctx must be mx.context or list of mx.context")
if (is.character(optimizer)) {
if (is.numeric(input.shape)) {
@@ -283,75 +279,75 @@ mx.model.buckets <- function(symbol, train.data,
eval.data = NULL, metric = NULL
}
optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize), ...)
}
-
+
sym_ini <- if (is.list(symbol)) symbol[[names(train.data$bucketID)]] else
symbol
-
+
arguments <- sym_ini$arguments
input.names <- intersect(names(train.data$value()), arguments)
-
+
input.shape <- sapply(input.names, function(n) {
dim(train.data$value()[[n]])
}, simplify = FALSE)
-
+
shapes <- sym_ini$infer.shape(input.shape)
-
+
# assign arg.params and aux.params arguments to arg.params.input and
aux.params.input
arg.params.input <- arg.params
aux.params.input <- aux.params
-
+
# initialize all arguments with zeros
arg.params <- lapply(shapes$arg.shapes, function(shape) {
mx.nd.zeros(shape = shape, ctx = mx.cpu())
})
-
+
# initialize input parameters
dlist <- arg.params[input.names]
-
+
# initialize parameters - only argument ending with _weight and _bias are
initialized
arg.params.ini <- mx.init.create(initializer = initializer, shape.array =
shapes$arg.shapes, ctx = mx.cpu(), skip.unknown = TRUE)
-
+
# assign initilized parameters to arg.params
arg.params[names(arg.params.ini)] <- arg.params.ini
-
+
# assign input params to arg.params
arg.params[names(arg.params.input)] <- arg.params.input
-
+
# remove input params from arg.params
arg.params[input.names] <- NULL
-
+
# Grad request
grad.req <- rep("null", length(arguments))
grad.req.write <- arguments %in% setdiff(names(arg.params.ini), fixed.params)
grad.req[grad.req.write] <- "write"
-
+
# Arg array order
update_names <- c(input.names, names(arg.params))
arg.update.idx <- match(arguments, update_names)
-
+
# aux parameters setup
aux.params <- lapply(shapes$aux.shapes, function(shape) {
mx.nd.zeros(shape = shape, ctx = mx.cpu())
})
-
+
aux.params.ini <- mx.init.create(initializer, shapes$aux.shapes, ctx =
mx.cpu(), skip.unknown = FALSE)
if (length(aux.params) > 0) {
aux.params[names(aux.params.ini)] <- aux.params.ini
} else aux.params <- NULL
-
+
aux.params[names(aux.params.input)] <- aux.params.input
-
+
# kvstore initialization
- kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx),
+ kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx),
verbose = verbose)
-
+
### Execute training
- model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data =
train.data, eval.data = eval.data,
- dlist = dlist, arg.params = arg.params,
aux.params = aux.params,
- grad.req = grad.req, arg.update.idx =
arg.update.idx,
- optimizer = optimizer, metric = metric,
- begin.round = begin.round, end.round =
num.round,
- batch.end.callback = batch.end.callback,
epoch.end.callback = epoch.end.callback,
- kvstore = kvstore, verbose = verbose,
metric_cpu = metric_cpu, gc_freq = gc_freq)
-
+ model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data =
train.data, eval.data = eval.data,
+ dlist = dlist, arg.params = arg.params,
aux.params = aux.params,
+ grad.req = grad.req, arg.update.idx =
arg.update.idx,
+ optimizer = optimizer, metric = metric,
+ begin.round = begin.round, end.round =
num.round,
+ batch.end.callback = batch.end.callback,
epoch.end.callback = epoch.end.callback,
+ kvstore = kvstore, verbose = verbose,
metric_cpu = metric_cpu)
+
return(model)
}
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 3c503c2..7283f67 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -1,31 +1,69 @@
#' Create an SGD optimizer with respective parameters.
#' Perform SGD with momentum update
#'
-mx.opt.sgd <- function(learning.rate,
- momentum=0,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+#' @param learning.rate float, default=0.01
+#' The initial learning rate.
+#' @param momentum float, default=0
+#' The momentum value.
+#' @param wd float, default=0.0
+#' L2 regularization coefficient add to all the weights.
+#' @param rescale.grad float, default=1.0
+#' rescaling factor of gradient.
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
+#' clip gradient in range [-clip_gradient, clip_gradient].
+#' @param lr_scheduler function, optional
+#' The learning rate scheduler.
+mx.opt.sgd <- function(learning.rate = 0.01,
+ momentum = 0,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
sgd <- new.env()
sgd$lr <- lr
sgd$count <- 0
sgd$num_update <- 0
- create.state <- function(index, weight) {
+ create_exec <- function(index, weight_dim, ctx) {
+
if (momentum == 0) {
- return(NULL)
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+
+ sym <- mx.symbol.sgd_update(weight,
+ grad,
+ lr = lr,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
} else {
- ret <- (mx.nd.zeros(dim(weight), ctx(weight)))
- return(ret)
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ mom <- mx.symbol.Variable("mom")
+
+ sym <- mx.symbol.sgd_mom_update(weight,
+ grad,
+ mom,
+ lr = lr,
+ wd = wd,
+ momentum= momentum,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
}
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(sgd) ## changing lr
@@ -40,77 +78,104 @@ mx.opt.sgd <- function(learning.rate,
sgd$num_update <- max(sgd$num_update, sgd[[indexKey]])
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
- if (is.null(state)) {
- weight <- weight - lr * (grad + wd * weight)
- } else {
- mom <- state
- mom <- mom * momentum
- mom <- mom - lr * (grad + wd * weight)
- weight <- weight + mom
- state <- mom
- }
- return(list(weight=weight, state=state))
+
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
#' Create an RMSProp optimizer with respective parameters.
#' Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient
by a running average of its recent magnitude[J]. COURSERA: Neural Networks for
Machine Learning, 2012, 4(2).
#' The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by
Alex Graves, 2013.
-#'
+#'
#' @param learning.rate float, default=0.002
-#' Step size.
+#' The initial learning rate.
#' @param gamma1 float, default=0.95
#' decay factor of moving average for gradient, gradient^2.
-#' @param gamm2 float, default=0.9
+#' @param gamma2 float, default=0.9
#' "momentum" factor.
+#' @param epsilon float, default=1e-4
#' @param wd float, default=0.0
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.rmsprop <- function(learning.rate=0.002,
- gamma1=0.95,
- gamma2=0.9,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.rmsprop <- function(learning.rate = 0.002,
+ centered = TRUE,
+ gamma1 = 0.95,
+ gamma2 = 0.9,
+ epsilon = 1e-4,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
rmsprop <- new.env()
rmsprop$lr <- lr
rmsprop$count <- 0
rmsprop$num_update <- 0
- create.state <- function(index, weight) {
- return (list(n=mx.nd.zeros(dim(weight), ctx(weight)),
- g=mx.nd.zeros(dim(weight), ctx(weight)),
- delta=mx.nd.zeros(dim(weight), ctx(weight))))
+ create_exec <- function(index, weight_dim, ctx) {
+
+ if (centered) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ n <- mx.symbol.Variable("n")
+ g <- mx.symbol.Variable("g")
+ delta <- mx.symbol.Variable("delta")
+
+ sym <- mx.symbol.rmspropalex_update(weight,
+ grad,
+ n,
+ g,
+ delta,
+ lr = lr,
+ gamma1 = gamma1,
+ gamma2 = gamma2,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+ } else {
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ n <- mx.symbol.Variable("n")
+
+ sym <- mx.symbol.rmsprop_update(weight,
+ grad,
+ n,
+ lr = lr,
+ gamma1 = gamma1,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+ }
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(rmsprop) ## changing lr
lr <- rmsprop$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)) {
rmsprop[[indexKey]] <- 0
} else {
indexValue <- rmsprop[[indexKey]]
@@ -118,27 +183,12 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
rmsprop$num_update <- max(rmsprop$num_update, rmsprop[[indexKey]])
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
- n <- state$n
- g <- state$g
- delta <- state$delta
- n <- gamma1 * n + (1 - gamma1) * (grad * grad)
- g <- gamma1 * g + (1 - gamma1) * grad
- delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd *
weight)
- weight <- weight + delta
- state <- list(n=n, g=g, delta=delta)
-
- return(list(weight=weight, state=state))
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
#' Create an Adam optimizer with respective parameters.
@@ -148,8 +198,8 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
#' Adam: A Method for Stochastic Optimization,
#' http://arxiv.org/abs/1412.6980
#'
-#' @param learning.rate float, default=0.001
-#' Step size.
+#' @param learning.rate float, default=1e-3
+#' The initial learning rate.
#' @param beta1 float, default=0.9
#' Exponential decay rate for the first moment estimates.
#' @param beta2 float, default=0.999
@@ -159,41 +209,60 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.adam <- function(learning.rate=0.001,
- beta1=0.9,
- beta2=0.999,
- epsilon=1e-8,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.adam <- function(learning.rate = 1e-3,
+ beta1 = 0.9,
+ beta2 = 0.999,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
adam <- new.env()
adam$lr <- lr
adam$count <- 0
adam$num_update <- 0
- create.state <- function(index, weight) {
- return (list(mean=mx.nd.zeros(dim(weight), ctx(weight)),
- variance=mx.nd.zeros(dim(weight), ctx(weight))))
+ create_exec <- function(index, weight_dim, ctx) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ mean <- mx.symbol.Variable("mean")
+ var <- mx.symbol.Variable("var")
+
+ sym <- mx.symbol.adam_update(weight,
+ grad,
+ mean,
+ var,
+ lr = lr,
+ beta1 = beta1,
+ beta2 = beta2,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(adam) ## changing lr
lr <- adam$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = adam, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = adam, x = indexKey, inherits = FALSE)) {
adam[[indexKey]] <- 0
} else {
indexValue <- adam[[indexKey]]
@@ -202,44 +271,15 @@ mx.opt.adam <- function(learning.rate=0.001,
}
}
- # increment time
- time.key <- paste0('t', index)
- if (!exists(envir = adam, x = time.key, inherits = FALSE)){
- adam[[time.key]] <- 0
- }
- t <- adam[[time.key]]
- t <- t + 1
- adam[[time.key]] <- t
-
- mean <- state$mean
- variance <- state$variance
-
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
-
- mean <- beta1 * mean + (1 - beta1) * grad
- variance <- beta2 * variance + (1 - beta2) * (grad * grad)
-
- coef1 <- 1 - beta1^t
- coef2 <- 1 - beta2^t
- lr <- lr * sqrt(coef2)/coef1
-
- weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon)
- weight <- weight - lr * wd * weight
-
- state <- list(mean=mean, variance=variance)
-
- return(list(weight=weight, state=state))
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
+
+
#' Create an AdaGrad optimizer with respective parameters.
#' AdaGrad optimizer of Duchi et al., 2011,
#'
@@ -254,38 +294,58 @@ mx.opt.adam <- function(learning.rate=0.001,
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, default=-1.0 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.adagrad <- function(learning.rate=0.05,
- epsilon=1e-8,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.adagrad <- function(learning.rate = 0.05,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
# use lr as short for learing rate.
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
adagrad <- new.env()
adagrad$lr <- lr
adagrad$count <- 0
adagrad$num_update <- 0
- create.state <- function(index, weight) {
- return (mx.nd.zeros(dim(weight), ctx(weight))) #history
+ create_exec <- function(index, weight_dim, ctx) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ history <- mx.symbol.Variable("history")
+
+ grad <- grad * rescale.grad
+ if (!is.null(clip_gradient)) {
+ if (clip_gradient >= 0) {
+ grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max =
clip_gradient)
+ }
+ }
+
+ history <- history + (grad * grad)
+ weight <- weight - lr * (grad / mx.symbol.sqrt(history + epsilon) + wd *
weight)
+
+ w <- mx.symbol.identity(weight, name = "w")
+ h <- mx.symbol.identity(history, name = "h")
+ sym <- mx.symbol.Group(c(w, h))
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
- if (!is.null(lr_scheduler)){
+ update <- function(index, exec_w, weight, grad) {
+ if (!is.null(lr_scheduler)) {
lr_scheduler(adagrad) ## changing lr
lr <- adagrad$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)) {
adagrad[[indexKey]] <- 0
} else {
indexValue <- adagrad[[indexKey]]
@@ -294,25 +354,18 @@ mx.opt.adagrad <- function(learning.rate=0.05,
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
- history <- state
- history <- history + (grad * grad)
- weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd *
weight)
- state <- history
+ # update state
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(history =
exec_w$ref.outputs$h_output), match.name = T)
- return(list(weight=weight, state=state))
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
+
#' Create an AdaDelta optimizer with respective parameters.
#'
#' AdaDelta optimizer as described in Zeiler, M. D. (2012).
@@ -325,50 +378,64 @@ mx.opt.adagrad <- function(learning.rate=0.05,
#' The constant as described in the thesis.
#' @param wd float, default=0.0
#' L2 regularization coefficient add to all the weights.
-#' @param rescale.grad float, default=1.0
+#' @param rescale.grad float, default=1
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#'
-mx.opt.adadelta <- function(rho=0.90,
- epsilon=1e-5,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL) {
+mx.opt.adadelta <- function(rho = 0.90,
+ epsilon = 1e-5,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1) {
adadelta <- new.env()
- create.state <- function(index, weight) {
- return (list(acc.g=mx.nd.zeros(dim(weight), ctx(weight)), #
accumulated g
- acc.delta=mx.nd.zeros(dim(weight), ctx(weight)))) #
accumulated delta
- }
+ create_exec <- function(index, weight_dim, ctx) {
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ acc.g <- mx.symbol.Variable("acc.g")
+ acc.delta <- mx.symbol.Variable("acc.delta")
- update <- function(index, weight, grad, state) {
- # preprocess grad
grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
+ if (!is.null(clip_gradient)) {
+ if (clip_gradient >= 0) {
+ grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max =
clip_gradient)
}
}
- # accumulated g and delta initlization
- acc.g <- state$acc.g
- acc.delta <- state$acc.delta
-
- # update g, delta
+ # update state (acc.g, acc.delta)
acc.g <- rho * acc.g + (1 - rho) * (grad * grad)
- current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g +
epsilon) * grad
+ current.delta <- mx.symbol.sqrt(acc.delta + epsilon) /
mx.symbol.sqrt(acc.g + epsilon) * grad
acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta)
weight <- weight - current.delta - wd * weight
- state <- list(acc.g=acc.g, acc.delta=acc.delta)
- return(list(weight=weight, state=state))
+ w <- mx.symbol.identity(weight, name = "w")
+ g <- mx.symbol.identity(acc.g, name = "g")
+ delta <- mx.symbol.identity(acc.delta, name = "delta")
+ sym <- mx.symbol.Group(c(w, g, delta))
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- return(list(create.state=create.state, update=update))
+
+ update <- function(index, exec_w, weight, grad) {
+
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+
+ # update state
+ mx.exec.update.arg.arrays(exec_w,
+ arg.arrays = list(
+ acc.g = exec_w$ref.outputs$g_output,
+ acc.delta = exec_w$ref.outputs$delta_output),
+ match.name = T)
+
+ return(exec_w$ref.outputs$w_output)
+ }
+ return(list(create_exec = create_exec, update = update))
}
+
#' Create an optimizer by name and parameters
#'
#' @param name The name of the optimizer
@@ -392,31 +459,28 @@ mx.opt.create <- function(name, ...) {
#' @param weights The weights to be optimized
#'
#' @export
-mx.opt.get.updater <- function(optimizer, weights) {
- # This is the list to keep track of internal states of optimzer
- state.list <- lapply(seq_along(weights), function(i) {
- if (is.null(weights[[i]])) return(NULL)
- optimizer$create.state(i, weights[[i]])
+mx.opt.get.updater <- function(optimizer, weights, ctx) {
+
+ exec_list <- lapply(seq_along(weights), function(i) {
+ if (is.null(weights[[i]])) {
+ return(NULL)
+ } else {
+ optimizer$create_exec(index = i, weight_dim = dim(weights[[i]]), ctx =
ctx)
+ }
})
+
update <- optimizer$update
update.closure <- function(weight, grad) {
- ulist <- lapply(seq_along(weight), function(i) {
+
+ weight_list <- lapply(seq_along(weight), function(i) {
if (!is.null(grad[[i]])) {
- update(i, weight[[i]], grad[[i]], state.list[[i]])
+ return(update(i, exec_list[[i]], weight[[i]], grad[[i]]))
} else {
return(NULL)
}
})
- # update state list, use mutate assignment
- state.list <<- lapply(ulist, function(x) {
- x$state
- })
- # return updated weight list
- weight.list <- lapply(ulist, function(x) {
- x$weight
- })
- return(weight.list)
+ return(weight_list)
}
return(update.closure)
}
diff --git a/R-package/tests/testthat/test_optimizer.R
b/R-package/tests/testthat/test_optimizer.R
new file mode 100644
index 0000000..c6dacaa
--- /dev/null
+++ b/R-package/tests/testthat/test_optimizer.R
@@ -0,0 +1,204 @@
+context("optimizer")
+
+test_that("sgd", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("sgd",
+ learning.rate = 1,
+ momentum = 0,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("rmsprop", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("rmsprop", learning.rate = 1,
+ centered = TRUE,
+ gamma1 = 0.95,
+ gamma2 = 0.9,
+ epsilon = 1e-4,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adam", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adam",
+ learning.rate = 1,
+ beta1 = 0.9,
+ beta2 = 0.999,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adagrad", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adagrad",
+ learning.rate = 1,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adadelta", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adadelta",
+ rho = 0.90,
+ epsilon = 1e-5,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2,1)),
tolerance = 1e-1)
+
+})