This is an automated email from the ASF dual-hosted git repository.
the pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new be47870 [MXNET-563] Refactor R optimizers to fix memory leak (#11374)
be47870 is described below
commit be478700e01944ecfee0c30a7cc5dc07d1b2789a
Author: jeremiedb <[email protected]>
AuthorDate: Wed Jul 25 17:10:16 2018 -0400
[MXNET-563] Refactor R optimizers to fix memory leak (#11374)
* refactor R optimizers to fix memory leak
* add Adadelta and Adagrad
* fix comments
* fix comments
* fix comments
* add tests
* fix whitespaces
* fix whitespaces
* fix typo
* fix typo
* add doc on clipping
---
R-package/R/model.R | 2 +-
R-package/R/model.rnn.R | 170 ++++++-----
R-package/R/optimizer.R | 450 +++++++++++++++++-------------
R-package/tests/testthat/test_optimizer.R | 204 ++++++++++++++
4 files changed, 545 insertions(+), 281 deletions(-)
diff --git a/R-package/R/model.R b/R-package/R/model.R
index b461f79..a2c4419 100644
--- a/R-package/R/model.R
+++ b/R-package/R/model.R
@@ -147,7 +147,7 @@ mx.model.train <- function(symbol, ctx, input.shape,
output.shape,
kvstore$set.optimizer(optimizer)
} else {
updaters <- lapply(seq_len(ndevice), function(i) {
- mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
+ mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx =
ctx[[i]])
})
}
if (!is.null(kvstore)) {
diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R
index f328d1b..580c82a 100644
--- a/R-package/R/model.rnn.R
+++ b/R-package/R/model.rnn.R
@@ -1,51 +1,50 @@
# Internal function to do multiple device training on RNN
-mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
- dlist, arg.params, aux.params,
- grad.req, arg.update.idx,
+mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data,
+ dlist, arg.params, aux.params,
+ grad.req, arg.update.idx,
begin.round, end.round, optimizer, metric,
metric_cpu,
- epoch.end.callback, batch.end.callback,
kvstore, verbose,
- gc_freq) {
-
+ epoch.end.callback, batch.end.callback,
kvstore, verbose) {
+
ndevice <- length(ctx)
- if (verbose)
+ if (verbose)
message("Start training with ", ndevice, " devices")
-
+
input.names <- names(dlist)
arg.params.names <- names(arg.params)
-
+
if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else
sym_ini <- symbol
-
+
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs
= ndevice, axis = 0, squeeze_axis = FALSE))
})
-
+
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s,
arg.params)[arg.update.idx],
+ mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s,
arg.params)[arg.update.idx],
aux.arrays = aux.params, ctx = ctx[[i]], grad.req =
grad.req)
})
-
+
# KVStore related stuffs
params.index <- as.integer(
mx.util.filter.null(
lapply(seq_along(train.execs[[1]]$ref.grad.arrays), function(k) {
if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL}
)))
-
+
update.on.kvstore <- FALSE
if (!is.null(kvstore) && kvstore$update.on.kvstore) {
update.on.kvstore <- TRUE
kvstore$set.optimizer(optimizer)
} else {
updaters <- lapply(seq_len(ndevice), function(i) {
- mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays)
+ mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays, ctx =
ctx[[i]])
})
}
-
+
if (!is.null(kvstore)) {
kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index])
}
-
+
# train over specified number of epochs
for (iteration in begin.round:end.round) {
nbatch <- 0
@@ -55,20 +54,20 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
}
train.data$reset()
while (train.data$iter.next()) {
-
+
# Get iterator data
dlist <- train.data$value()[input.names]
-
+
# Slice inputs for multi-devices
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]],
num_outputs = ndevice, axis = 0, squeeze_axis = F))
})
-
+
# Assign input to each executor - bug on inference if using BatchNorm
if (is.list(symbol)) {
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]],
+ mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]],
arg.arrays = c(s,
train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
aux.arrays = train.execs[[i]]$aux.arrays, ctx =
ctx[[i]], grad.req = grad.req)
})
@@ -78,12 +77,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
}
}
-
+
# forward pass
for (texec in train.execs) {
mx.exec.forward(texec, is.train = TRUE)
}
-
+
# copy of preds and labels for metric
if (!is.null(metric)) {
preds <- lapply(train.execs, function(texec) {texec$ref.outputs[[1]]})
@@ -93,12 +92,12 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
labels <- lapply(seq_along(train.execs), function(i)
{mx.nd.copyto(labels[[i]], mx.cpu())})
}
}
-
+
# backward pass
for (texec in train.execs) {
mx.exec.backward(texec)
}
-
+
if (!is.null(kvstore)) {
# push the gradient
kvstore$push(params.index, lapply(train.execs, function(texec) {
@@ -124,7 +123,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]],
skip.null = TRUE)
}
}
-
+
# Update the evaluation metrics
if (!is.null(metric)) {
for (i in seq_len(ndevice)) {
@@ -133,43 +132,40 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
state = train.metric)
}
}
-
+
nbatch <- nbatch + 1
- if (!is.null(gc_freq)) {
- if (nbatch %% gc_freq == 0) gc()
- }
-
+
if (!is.null(batch.end.callback)) {
batch.end.callback(iteration, nbatch, environment())
}
}
-
+
if (!is.null(metric)) {
result <- metric$get(train.metric)
- if (verbose)
+ if (verbose)
message("[", iteration, "] Train-", result$name, "=", result$value)
}
-
+
if (!is.null(eval.data)) {
if (!is.null(metric)) {
eval.metric <- metric$init()
}
eval.data$reset()
while (eval.data$iter.next()) {
-
+
# Get iterator data
dlist <- eval.data$value()[input.names]
-
+
# Slice input to multiple devices
slices <- lapply(seq_len(ndevice), function(i) {
sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]],
num_outputs = ndevice, axis = 0, squeeze_axis = FALSE))
})
-
+
# Assign input to each executor - bug on inference if using BatchNorm
if (is.list(symbol)) {
train.execs <- lapply(seq_len(ndevice), function(i) {
s <- slices[[i]]
- mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]],
+ mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]],
arg.arrays = c(s,
train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx],
aux.arrays = train.execs[[i]]$aux.arrays, ctx =
ctx[[i]], grad.req = grad.req)
})
@@ -179,12 +175,12 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE)
}
}
-
+
# forward pass
for (texec in train.execs) {
mx.exec.forward(texec, is.train = FALSE)
}
-
+
# copy of preds and labels for metric and update metric
if (!is.null(metric)) {
preds <- lapply(train.execs, function(texec)
{texec$ref.outputs[[1]]})
@@ -194,17 +190,17 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
labels <- lapply(seq_along(train.execs), function(i)
{mx.nd.copyto(labels[[i]], mx.cpu())})
}
for (i in seq_len(ndevice)) {
- eval.metric <- metric$update(label = labels[[i]],
- pred = preds[[i]],
+ eval.metric <- metric$update(label = labels[[i]],
+ pred = preds[[i]],
state = eval.metric)
}
}
}
-
+
if (!is.null(metric)) {
result <- metric$get(eval.metric)
if (verbose) {
- message("[", iteration, "] Validation-", result$name, "=",
+ message("[", iteration, "] Validation-", result$name, "=",
result$value)
}
}
@@ -213,12 +209,12 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
}
# get the model out
model <- mx.model.extract.model(sym_ini, train.execs)
-
+
epoch_continue <- TRUE
if (!is.null(epoch.end.callback)) {
epoch_continue <- epoch.end.callback(iteration, 0, environment(),
verbose = verbose)
}
-
+
if (!epoch_continue) {
break
}
@@ -227,7 +223,7 @@ mx.model.train.buckets <- function(symbol, ctx, train.data,
eval.data,
}
-#
+#
#' Train RNN with bucket support
#'
#' @param symbol Symbol or list of Symbols representing the model
@@ -245,33 +241,33 @@ mx.model.train.buckets <- function(symbol, ctx,
train.data, eval.data,
#' @param verbose
#'
#' @export
-mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric =
NULL,
- arg.params = NULL, aux.params = NULL,
fixed.params = NULL,
- num.round = 1, begin.round = 1,
- initializer = mx.init.uniform(0.01), optimizer =
"sgd", ctx = NULL,
- batch.end.callback = NULL, epoch.end.callback =
NULL,
- kvstore = "local", verbose = TRUE, metric_cpu =
TRUE, gc_freq = NULL) {
-
+mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric =
NULL,
+ arg.params = NULL, aux.params = NULL,
fixed.params = NULL,
+ num.round = 1, begin.round = 1,
+ initializer = mx.init.uniform(0.01), optimizer =
"sgd", ctx = NULL,
+ batch.end.callback = NULL, epoch.end.callback =
NULL,
+ kvstore = "local", verbose = TRUE, metric_cpu =
TRUE) {
+
if (!train.data$iter.next()) {
train.data$reset()
- if (!train.data$iter.next())
+ if (!train.data$iter.next())
stop("Empty train.data")
}
-
+
if (!is.null(eval.data)) {
if (!eval.data$iter.next()) {
eval.data$reset()
- if (!eval.data$iter.next())
+ if (!eval.data$iter.next())
stop("Empty eval.data")
}
}
-
- if (is.null(ctx))
+
+ if (is.null(ctx))
ctx <- mx.ctx.default()
if (is.mx.context(ctx)) {
ctx <- list(ctx)
}
- if (!is.list(ctx))
+ if (!is.list(ctx))
stop("ctx must be mx.context or list of mx.context")
if (is.character(optimizer)) {
if (is.numeric(input.shape)) {
@@ -283,75 +279,75 @@ mx.model.buckets <- function(symbol, train.data,
eval.data = NULL, metric = NULL
}
optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize), ...)
}
-
+
sym_ini <- if (is.list(symbol)) symbol[[names(train.data$bucketID)]] else
symbol
-
+
arguments <- sym_ini$arguments
input.names <- intersect(names(train.data$value()), arguments)
-
+
input.shape <- sapply(input.names, function(n) {
dim(train.data$value()[[n]])
}, simplify = FALSE)
-
+
shapes <- sym_ini$infer.shape(input.shape)
-
+
# assign arg.params and aux.params arguments to arg.params.input and
aux.params.input
arg.params.input <- arg.params
aux.params.input <- aux.params
-
+
# initialize all arguments with zeros
arg.params <- lapply(shapes$arg.shapes, function(shape) {
mx.nd.zeros(shape = shape, ctx = mx.cpu())
})
-
+
# initialize input parameters
dlist <- arg.params[input.names]
-
+
# initialize parameters - only argument ending with _weight and _bias are
initialized
arg.params.ini <- mx.init.create(initializer = initializer, shape.array =
shapes$arg.shapes, ctx = mx.cpu(), skip.unknown = TRUE)
-
+
# assign initilized parameters to arg.params
arg.params[names(arg.params.ini)] <- arg.params.ini
-
+
# assign input params to arg.params
arg.params[names(arg.params.input)] <- arg.params.input
-
+
# remove input params from arg.params
arg.params[input.names] <- NULL
-
+
# Grad request
grad.req <- rep("null", length(arguments))
grad.req.write <- arguments %in% setdiff(names(arg.params.ini), fixed.params)
grad.req[grad.req.write] <- "write"
-
+
# Arg array order
update_names <- c(input.names, names(arg.params))
arg.update.idx <- match(arguments, update_names)
-
+
# aux parameters setup
aux.params <- lapply(shapes$aux.shapes, function(shape) {
mx.nd.zeros(shape = shape, ctx = mx.cpu())
})
-
+
aux.params.ini <- mx.init.create(initializer, shapes$aux.shapes, ctx =
mx.cpu(), skip.unknown = FALSE)
if (length(aux.params) > 0) {
aux.params[names(aux.params.ini)] <- aux.params.ini
} else aux.params <- NULL
-
+
aux.params[names(aux.params.input)] <- aux.params.input
-
+
# kvstore initialization
- kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx),
+ kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx),
verbose = verbose)
-
+
### Execute training
- model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data =
train.data, eval.data = eval.data,
- dlist = dlist, arg.params = arg.params,
aux.params = aux.params,
- grad.req = grad.req, arg.update.idx =
arg.update.idx,
- optimizer = optimizer, metric = metric,
- begin.round = begin.round, end.round =
num.round,
- batch.end.callback = batch.end.callback,
epoch.end.callback = epoch.end.callback,
- kvstore = kvstore, verbose = verbose,
metric_cpu = metric_cpu, gc_freq = gc_freq)
-
+ model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data =
train.data, eval.data = eval.data,
+ dlist = dlist, arg.params = arg.params,
aux.params = aux.params,
+ grad.req = grad.req, arg.update.idx =
arg.update.idx,
+ optimizer = optimizer, metric = metric,
+ begin.round = begin.round, end.round =
num.round,
+ batch.end.callback = batch.end.callback,
epoch.end.callback = epoch.end.callback,
+ kvstore = kvstore, verbose = verbose,
metric_cpu = metric_cpu)
+
return(model)
}
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 3c503c2..7283f67 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -1,31 +1,69 @@
#' Create an SGD optimizer with respective parameters.
#' Perform SGD with momentum update
#'
-mx.opt.sgd <- function(learning.rate,
- momentum=0,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+#' @param learning.rate float, default=0.01
+#' The initial learning rate.
+#' @param momentum float, default=0
+#' The momentum value.
+#' @param wd float, default=0.0
+#' L2 regularization coefficient add to all the weights.
+#' @param rescale.grad float, default=1.0
+#' rescaling factor of gradient.
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
+#' clip gradient in range [-clip_gradient, clip_gradient].
+#' @param lr_scheduler function, optional
+#' The learning rate scheduler.
+mx.opt.sgd <- function(learning.rate = 0.01,
+ momentum = 0,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
sgd <- new.env()
sgd$lr <- lr
sgd$count <- 0
sgd$num_update <- 0
- create.state <- function(index, weight) {
+ create_exec <- function(index, weight_dim, ctx) {
+
if (momentum == 0) {
- return(NULL)
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+
+ sym <- mx.symbol.sgd_update(weight,
+ grad,
+ lr = lr,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
} else {
- ret <- (mx.nd.zeros(dim(weight), ctx(weight)))
- return(ret)
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ mom <- mx.symbol.Variable("mom")
+
+ sym <- mx.symbol.sgd_mom_update(weight,
+ grad,
+ mom,
+ lr = lr,
+ wd = wd,
+ momentum= momentum,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
}
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(sgd) ## changing lr
@@ -40,77 +78,104 @@ mx.opt.sgd <- function(learning.rate,
sgd$num_update <- max(sgd$num_update, sgd[[indexKey]])
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
- if (is.null(state)) {
- weight <- weight - lr * (grad + wd * weight)
- } else {
- mom <- state
- mom <- mom * momentum
- mom <- mom - lr * (grad + wd * weight)
- weight <- weight + mom
- state <- mom
- }
- return(list(weight=weight, state=state))
+
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
#' Create an RMSProp optimizer with respective parameters.
#' Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient
by a running average of its recent magnitude[J]. COURSERA: Neural Networks for
Machine Learning, 2012, 4(2).
#' The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by
Alex Graves, 2013.
-#'
+#'
#' @param learning.rate float, default=0.002
-#' Step size.
+#' The initial learning rate.
#' @param gamma1 float, default=0.95
#' decay factor of moving average for gradient, gradient^2.
-#' @param gamm2 float, default=0.9
+#' @param gamma2 float, default=0.9
#' "momentum" factor.
+#' @param epsilon float, default=1e-4
#' @param wd float, default=0.0
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.rmsprop <- function(learning.rate=0.002,
- gamma1=0.95,
- gamma2=0.9,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.rmsprop <- function(learning.rate = 0.002,
+ centered = TRUE,
+ gamma1 = 0.95,
+ gamma2 = 0.9,
+ epsilon = 1e-4,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
rmsprop <- new.env()
rmsprop$lr <- lr
rmsprop$count <- 0
rmsprop$num_update <- 0
- create.state <- function(index, weight) {
- return (list(n=mx.nd.zeros(dim(weight), ctx(weight)),
- g=mx.nd.zeros(dim(weight), ctx(weight)),
- delta=mx.nd.zeros(dim(weight), ctx(weight))))
+ create_exec <- function(index, weight_dim, ctx) {
+
+ if (centered) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ n <- mx.symbol.Variable("n")
+ g <- mx.symbol.Variable("g")
+ delta <- mx.symbol.Variable("delta")
+
+ sym <- mx.symbol.rmspropalex_update(weight,
+ grad,
+ n,
+ g,
+ delta,
+ lr = lr,
+ gamma1 = gamma1,
+ gamma2 = gamma2,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+ } else {
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ n <- mx.symbol.Variable("n")
+
+ sym <- mx.symbol.rmsprop_update(weight,
+ grad,
+ n,
+ lr = lr,
+ gamma1 = gamma1,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+ }
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(rmsprop) ## changing lr
lr <- rmsprop$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = rmsprop, x = indexKey, inherits = FALSE)) {
rmsprop[[indexKey]] <- 0
} else {
indexValue <- rmsprop[[indexKey]]
@@ -118,27 +183,12 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
rmsprop$num_update <- max(rmsprop$num_update, rmsprop[[indexKey]])
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
- n <- state$n
- g <- state$g
- delta <- state$delta
- n <- gamma1 * n + (1 - gamma1) * (grad * grad)
- g <- gamma1 * g + (1 - gamma1) * grad
- delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd *
weight)
- weight <- weight + delta
- state <- list(n=n, g=g, delta=delta)
-
- return(list(weight=weight, state=state))
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
#' Create an Adam optimizer with respective parameters.
@@ -148,8 +198,8 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
#' Adam: A Method for Stochastic Optimization,
#' http://arxiv.org/abs/1412.6980
#'
-#' @param learning.rate float, default=0.001
-#' Step size.
+#' @param learning.rate float, default=1e-3
+#' The initial learning rate.
#' @param beta1 float, default=0.9
#' Exponential decay rate for the first moment estimates.
#' @param beta2 float, default=0.999
@@ -159,41 +209,60 @@ mx.opt.rmsprop <- function(learning.rate=0.002,
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.adam <- function(learning.rate=0.001,
- beta1=0.9,
- beta2=0.999,
- epsilon=1e-8,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.adam <- function(learning.rate = 1e-3,
+ beta1 = 0.9,
+ beta2 = 0.999,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
- # use lr as short for learing rate.
+
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
adam <- new.env()
adam$lr <- lr
adam$count <- 0
adam$num_update <- 0
- create.state <- function(index, weight) {
- return (list(mean=mx.nd.zeros(dim(weight), ctx(weight)),
- variance=mx.nd.zeros(dim(weight), ctx(weight))))
+ create_exec <- function(index, weight_dim, ctx) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ mean <- mx.symbol.Variable("mean")
+ var <- mx.symbol.Variable("var")
+
+ sym <- mx.symbol.adam_update(weight,
+ grad,
+ mean,
+ var,
+ lr = lr,
+ beta1 = beta1,
+ beta2 = beta2,
+ epsilon = epsilon,
+ wd = wd,
+ rescale_grad = rescale.grad,
+ clip_gradient = clip_gradient,
+ name = "w")
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
+ update <- function(index, exec_w, weight, grad) {
if (!is.null(lr_scheduler)){
lr_scheduler(adam) ## changing lr
lr <- adam$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = adam, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = adam, x = indexKey, inherits = FALSE)) {
adam[[indexKey]] <- 0
} else {
indexValue <- adam[[indexKey]]
@@ -202,44 +271,15 @@ mx.opt.adam <- function(learning.rate=0.001,
}
}
- # increment time
- time.key <- paste0('t', index)
- if (!exists(envir = adam, x = time.key, inherits = FALSE)){
- adam[[time.key]] <- 0
- }
- t <- adam[[time.key]]
- t <- t + 1
- adam[[time.key]] <- t
-
- mean <- state$mean
- variance <- state$variance
-
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
-
- mean <- beta1 * mean + (1 - beta1) * grad
- variance <- beta2 * variance + (1 - beta2) * (grad * grad)
-
- coef1 <- 1 - beta1^t
- coef2 <- 1 - beta2^t
- lr <- lr * sqrt(coef2)/coef1
-
- weight <- weight - lr * mean / (mx.nd.sqrt(variance) + epsilon)
- weight <- weight - lr * wd * weight
-
- state <- list(mean=mean, variance=variance)
-
- return(list(weight=weight, state=state))
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
+
+
#' Create an AdaGrad optimizer with respective parameters.
#' AdaGrad optimizer of Duchi et al., 2011,
#'
@@ -254,38 +294,58 @@ mx.opt.adam <- function(learning.rate=0.001,
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, default=-1.0 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#' @param lr_scheduler function, optional
#' The learning rate scheduler.
#'
-mx.opt.adagrad <- function(learning.rate=0.05,
- epsilon=1e-8,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL,
+mx.opt.adagrad <- function(learning.rate = 0.05,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1,
lr_scheduler = NULL) {
# use lr as short for learing rate.
lr <- learning.rate
- count <- 0
- num_update <- 0
+ count <- 0
+ num_update <- 0
adagrad <- new.env()
adagrad$lr <- lr
adagrad$count <- 0
adagrad$num_update <- 0
- create.state <- function(index, weight) {
- return (mx.nd.zeros(dim(weight), ctx(weight))) #history
+ create_exec <- function(index, weight_dim, ctx) {
+
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ history <- mx.symbol.Variable("history")
+
+ grad <- grad * rescale.grad
+ if (!is.null(clip_gradient)) {
+ if (clip_gradient >= 0) {
+ grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max =
clip_gradient)
+ }
+ }
+
+ history <- history + (grad * grad)
+ weight <- weight - lr * (grad / mx.symbol.sqrt(history + epsilon) + wd *
weight)
+
+ w <- mx.symbol.identity(weight, name = "w")
+ h <- mx.symbol.identity(history, name = "h")
+ sym <- mx.symbol.Group(c(w, h))
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- update <- function(index, weight, grad, state) {
- if (!is.null(lr_scheduler)){
+ update <- function(index, exec_w, weight, grad) {
+ if (!is.null(lr_scheduler)) {
lr_scheduler(adagrad) ## changing lr
lr <- adagrad$lr
## update count
indexKey <- paste0('ik', index)
- if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)){
+ if (!exists(envir = adagrad, x = indexKey, inherits = FALSE)) {
adagrad[[indexKey]] <- 0
} else {
indexValue <- adagrad[[indexKey]]
@@ -294,25 +354,18 @@ mx.opt.adagrad <- function(learning.rate=0.05,
}
}
- grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
- }
- }
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
- history <- state
- history <- history + (grad * grad)
- weight <- weight - lr * (grad / mx.nd.sqrt(history + epsilon) + wd *
weight)
- state <- history
+ # update state
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(history =
exec_w$ref.outputs$h_output), match.name = T)
- return(list(weight=weight, state=state))
+ return(exec_w$ref.outputs$w_output)
}
- return(list(create.state=create.state, update=update))
+ return(list(create_exec = create_exec, update = update))
}
+
#' Create an AdaDelta optimizer with respective parameters.
#'
#' AdaDelta optimizer as described in Zeiler, M. D. (2012).
@@ -325,50 +378,64 @@ mx.opt.adagrad <- function(learning.rate=0.05,
#' The constant as described in the thesis.
#' @param wd float, default=0.0
#' L2 regularization coefficient add to all the weights.
-#' @param rescale.grad float, default=1.0
+#' @param rescale.grad float, default=1
#' rescaling factor of gradient.
-#' @param clip_gradient float, optional
+#' @param clip_gradient float, default=-1 (no clipping if < 0)
#' clip gradient in range [-clip_gradient, clip_gradient].
#'
-mx.opt.adadelta <- function(rho=0.90,
- epsilon=1e-5,
- wd=0,
- rescale.grad=1,
- clip_gradient = NULL) {
+mx.opt.adadelta <- function(rho = 0.90,
+ epsilon = 1e-5,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1) {
adadelta <- new.env()
- create.state <- function(index, weight) {
- return (list(acc.g=mx.nd.zeros(dim(weight), ctx(weight)), #
accumulated g
- acc.delta=mx.nd.zeros(dim(weight), ctx(weight)))) #
accumulated delta
- }
+ create_exec <- function(index, weight_dim, ctx) {
+ weight <- mx.symbol.Variable("weight")
+ grad <- mx.symbol.Variable("grad")
+ acc.g <- mx.symbol.Variable("acc.g")
+ acc.delta <- mx.symbol.Variable("acc.delta")
- update <- function(index, weight, grad, state) {
- # preprocess grad
grad <- grad * rescale.grad
- if (!is.null(clip_gradient)){
- if(clip_gradient >= 0){
- grad <- mx.nd.clip(grad, -clip_gradient, clip_gradient)
- } else {
- stop("Error: clip_gradient should be positive number.")
+ if (!is.null(clip_gradient)) {
+ if (clip_gradient >= 0) {
+ grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max =
clip_gradient)
}
}
- # accumulated g and delta initlization
- acc.g <- state$acc.g
- acc.delta <- state$acc.delta
-
- # update g, delta
+ # update state (acc.g, acc.delta)
acc.g <- rho * acc.g + (1 - rho) * (grad * grad)
- current.delta <- mx.nd.sqrt(acc.delta + epsilon) / mx.nd.sqrt(acc.g +
epsilon) * grad
+ current.delta <- mx.symbol.sqrt(acc.delta + epsilon) /
mx.symbol.sqrt(acc.g + epsilon) * grad
acc.delta <- rho * acc.delta + (1 - rho) * (current.delta * current.delta)
weight <- weight - current.delta - wd * weight
- state <- list(acc.g=acc.g, acc.delta=acc.delta)
- return(list(weight=weight, state=state))
+ w <- mx.symbol.identity(weight, name = "w")
+ g <- mx.symbol.identity(acc.g, name = "g")
+ delta <- mx.symbol.identity(acc.delta, name = "delta")
+ sym <- mx.symbol.Group(c(w, g, delta))
+
+ exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx,
grad.req = "null")
+ return(exec)
}
- return(list(create.state=create.state, update=update))
+
+ update <- function(index, exec_w, weight, grad) {
+
+ mx.exec.update.arg.arrays(exec_w, arg.arrays = list(weight = weight,grad =
grad), match.name = T)
+ mx.exec.forward(exec_w, is.train = F)
+
+ # update state
+ mx.exec.update.arg.arrays(exec_w,
+ arg.arrays = list(
+ acc.g = exec_w$ref.outputs$g_output,
+ acc.delta = exec_w$ref.outputs$delta_output),
+ match.name = T)
+
+ return(exec_w$ref.outputs$w_output)
+ }
+ return(list(create_exec = create_exec, update = update))
}
+
#' Create an optimizer by name and parameters
#'
#' @param name The name of the optimizer
@@ -392,31 +459,28 @@ mx.opt.create <- function(name, ...) {
#' @param weights The weights to be optimized
#'
#' @export
-mx.opt.get.updater <- function(optimizer, weights) {
- # This is the list to keep track of internal states of optimzer
- state.list <- lapply(seq_along(weights), function(i) {
- if (is.null(weights[[i]])) return(NULL)
- optimizer$create.state(i, weights[[i]])
+mx.opt.get.updater <- function(optimizer, weights, ctx) {
+
+ exec_list <- lapply(seq_along(weights), function(i) {
+ if (is.null(weights[[i]])) {
+ return(NULL)
+ } else {
+ optimizer$create_exec(index = i, weight_dim = dim(weights[[i]]), ctx =
ctx)
+ }
})
+
update <- optimizer$update
update.closure <- function(weight, grad) {
- ulist <- lapply(seq_along(weight), function(i) {
+
+ weight_list <- lapply(seq_along(weight), function(i) {
if (!is.null(grad[[i]])) {
- update(i, weight[[i]], grad[[i]], state.list[[i]])
+ return(update(i, exec_list[[i]], weight[[i]], grad[[i]]))
} else {
return(NULL)
}
})
- # update state list, use mutate assignment
- state.list <<- lapply(ulist, function(x) {
- x$state
- })
- # return updated weight list
- weight.list <- lapply(ulist, function(x) {
- x$weight
- })
- return(weight.list)
+ return(weight_list)
}
return(update.closure)
}
diff --git a/R-package/tests/testthat/test_optimizer.R
b/R-package/tests/testthat/test_optimizer.R
new file mode 100644
index 0000000..c6dacaa
--- /dev/null
+++ b/R-package/tests/testthat/test_optimizer.R
@@ -0,0 +1,204 @@
+context("optimizer")
+
+test_that("sgd", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("sgd",
+ learning.rate = 1,
+ momentum = 0,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("rmsprop", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("rmsprop", learning.rate = 1,
+ centered = TRUE,
+ gamma1 = 0.95,
+ gamma2 = 0.9,
+ epsilon = 1e-4,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adam", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adam",
+ learning.rate = 1,
+ beta1 = 0.9,
+ beta2 = 0.999,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adagrad", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adagrad",
+ learning.rate = 1,
+ epsilon = 1e-8,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2,1)),
tolerance = 1e-1)
+
+})
+
+
+test_that("adadelta", {
+
+ data = mx.symbol.Variable('data')
+ label = mx.symbol.Variable('label')
+ fc_weight = mx.symbol.Variable('fc_weight')
+ fc = mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
name = 'fc1', num_hidden = 1)
+ loss = mx.symbol.LinearRegressionOutput(data = fc, label = label, name =
'loss')
+
+ x <- mx.nd.array(array(1:6, dim=2:3))
+ y <- mx.nd.array(c(5, 11, 16))
+ w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2,1)))
+
+ exec <- mxnet:::mx.symbol.bind(symbol = loss,
+ ctx = mx.cpu(),
+ arg.arrays = list(data = x,
+ fc1_weight = w1,
+ label = y),
+ aux.arrays = NULL,
+ grad.reqs = c("null", "write", "null"))
+
+ optimizer <- mx.opt.create("adadelta",
+ rho = 0.90,
+ epsilon = 1e-5,
+ wd = 0,
+ rescale.grad = 1,
+ clip_gradient = -1)
+
+ updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx =
mx.cpu())
+
+ mx.exec.forward(exec, is.train = T)
+ mx.exec.backward(exec)
+
+ arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
+ mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
+
+ expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2,1)),
tolerance = 1e-1)
+
+})