This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new e6a1139 add backward(is_train=False) and always mode for dropout
(#7303)
e6a1139 is described below
commit e6a1139812db33bd11fcc8491915f6a51d42418b
Author: Eric Junyuan Xie <[email protected]>
AuthorDate: Thu Aug 3 10:36:11 2017 -0700
add backward(is_train=False) and always mode for dropout (#7303)
* add backward(is_train=False) and always mode for dropout
* fix
* fix
* fix slicing
* fix mkl dropout
---
include/mxnet/c_api.h | 36 ++++++++++++-
include/mxnet/executor.h | 2 +-
python/mxnet/autograd.py | 70 +++++++++++++++++++------
python/mxnet/base.py | 4 +-
python/mxnet/contrib/autograd.py | 2 +
python/mxnet/executor.py | 29 ++++------
python/mxnet/gluon/model_zoo/vision/__init__.py | 5 +-
python/mxnet/image/detection.py | 5 +-
python/mxnet/ndarray.py | 17 ++++--
src/c_api/c_api_executor.cc | 9 +++-
src/c_api/c_api_ndarray.cc | 20 +++++--
src/executor/graph_executor.cc | 4 +-
src/executor/graph_executor.h | 2 +-
src/ndarray/autograd.cc | 10 ++--
src/ndarray/autograd.h | 14 ++++-
src/ndarray/ndarray.cc | 1 +
src/operator/dropout-inl.h | 21 ++++++--
src/operator/dropout.cc | 3 +-
tests/python/unittest/test_autograd.py | 16 ++++++
tests/python/unittest/test_operator.py | 36 +++++++++++++
20 files changed, 243 insertions(+), 63 deletions(-)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 47447fb..d9a5315 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -553,6 +553,13 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator
creator,
const char **param_vals);
/*!
* \brief set whether to record operator for autograd
+ * \param is_recording 1 when recording, 0 when not recording.
+ * \param prev returns the previous status before this set.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXAutogradSetIsRecording(int is_recording, int* prev);
+/*!
+ * \brief set whether to record operator for autograd
* \param is_train 1 when training, 0 when testing
* \param prev returns the previous status before this set.
* \return 0 when success, -1 when failure happens
@@ -589,6 +596,20 @@ MXNET_DLL int MXAutogradBackward(mx_uint num_output,
NDArrayHandle* ograd_handles,
int retain_graph);
/*!
* \brief compute the gradient of outputs w.r.t variables
+* \param num_output number of output NDArray
+* \param output_handles output NDArrays
+* \param ograd_handles head gradient for NDArrays
+* \param retain_graph whether to keep the graph after backward
+* \param is_train whether to do backward for training or inference
+* \return 0 when success, -1 when failure happens
+*/
+MXNET_DLL int MXAutogradBackwardEx(mx_uint num_output,
+ NDArrayHandle* output_handles,
+ NDArrayHandle* ograd_handles,
+ int retain_graph,
+ int is_train);
+/*!
* \brief create cached operator
*/
MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
@@ -1028,7 +1049,20 @@ MXNET_DLL int MXExecutorForward(ExecutorHandle handle,
int is_train);
MXNET_DLL int MXExecutorBackward(ExecutorHandle handle,
mx_uint len,
NDArrayHandle *head_grads);
-
+/*!
+ * \brief Executor run backward
+ *
+ * \param handle execute handle
+ * \param len length
+ * \param head_grads NDArray handle for heads' gradient
+ * \param is_train int value to indicate whether the backward pass is for
training or inference
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXExecutorBackwardEx(ExecutorHandle handle,
+ mx_uint len,
+ NDArrayHandle *head_grads,
+ int is_train);
/*!
* \brief Get executor's head NDArray
*
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index 40bd60f..9308587 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -58,7 +58,7 @@ class Executor {
*
* \param head_grads the gradient of head nodes to be backproped.
*/
- virtual void Backward(const std::vector<NDArray> &head_grads) = 0;
+ virtual void Backward(const std::vector<NDArray> &head_grads, bool is_train
= true) = 0;
/*!
* \brief print the execution plan info to output stream.
* \param os the output stream we like to print to.
diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py
index b97d350..2f33052 100644
--- a/python/mxnet/autograd.py
+++ b/python/mxnet/autograd.py
@@ -12,9 +12,7 @@ from .symbol import _GRAD_REQ_MAP
def set_recording(is_recording):
"""Set status to recording/not recording. When recording, graph will be
constructed
- for gradient computation. Operators will also run with ctx.is_train=True.
For example,
- Dropout will drop inputs randomly when is_train=True while simply passing
through
- if is_train=False.
+ for gradient computation.
Parameters
----------
@@ -25,46 +23,77 @@ def set_recording(is_recording):
previous state before this set.
"""
prev = ctypes.c_int()
- check_call(_LIB.MXAutogradSetIsTraining(
+ check_call(_LIB.MXAutogradSetIsRecording(
ctypes.c_int(is_recording), ctypes.byref(prev)))
return bool(prev.value)
+def set_training(is_train):
+ """Set status to training/not training. This affects ctx.is_train in
operator
+ running context. For example, Dropout will drop inputs randomly when
+ is_train=True while simply passing through if is_train=False.
+
+ Parameters
+ ----------
+ is_train: bool
+
+ Returns
+ -------
+ previous state before this set.
+ """
+ prev = ctypes.c_int()
+ check_call(_LIB.MXAutogradSetIsTraining(
+ ctypes.c_int(is_train), ctypes.byref(prev)))
+ return bool(prev.value)
+
-class TrainingStateScope(object):
+class RecordingStateScope(object):
"""Scope for managing training state.
Example::
- with TrainingStateScope(True):
+ with RecordingStateScope(True, True):
y = model(x)
backward([y])
"""
- def __init__(self, enter_state):
+ def __init__(self, enter_state, is_train):
self._enter_state = enter_state
+ self._enter_is_train = is_train
self._prev = None
+ self._prev_is_train = None
def __enter__(self):
self._prev = set_recording(self._enter_state)
+ self._prev_is_train = set_training(self._enter_is_train)
def __exit__(self, ptype, value, trace):
if self._prev != self._enter_state:
set_recording(self._prev)
+ if self._prev_is_train != self._enter_is_train:
+ set_training(self._prev_is_train)
-def record():
+def record(is_train=True):
"""Returns a training scope context to be used in 'with' statement
and captures training code.
+ .. note:: When forwarding with is_train=False, the corresponding backward
+ should also use is_train=False, otherwise gradient is undefined.
+
Example::
with autograd.record():
y = model(x)
backward([y])
metric.update(...)
optim.step(...)
+
+ Parameters
+ ----------
+ is_train: bool, default True
+ Whether to do forward for training or inference.
"""
- return TrainingStateScope(True)
+ return RecordingStateScope(True, is_train)
-def pause():
+def pause(is_train=False):
"""Returns a testing scope context to be used in 'with' statement
and captures testing code.
@@ -74,8 +103,13 @@ def pause():
backward([y])
with autograd.pause():
# testing, IO, gradient updates...
+
+ Parameters
+ ----------
+ is_train: bool, default False
+ Whether to do forward for training or inference.
"""
- return TrainingStateScope(False)
+ return RecordingStateScope(False, is_train)
def mark_variables(variables, gradients, grad_reqs='write'):
@@ -109,7 +143,7 @@ def mark_variables(variables, gradients, grad_reqs='write'):
c_array(NDArrayHandle, gradient_handles)))
-def backward(heads, head_grads=None, retain_graph=False):
+def backward(heads, head_grads=None, retain_graph=False, is_train=True):
"""Compute the gradients of heads w.r.t previously marked variables.
Parameters
@@ -118,6 +152,8 @@ def backward(heads, head_grads=None, retain_graph=False):
Output NDArray(s)
head_grads: NDArray or list of NDArray or None
Gradients with respect to heads.
+ is_train: bool, optional
+ Whether to do backward for training or inference.
"""
if isinstance(heads, NDArray):
assert head_grads is None or isinstance(head_grads, NDArray)
@@ -129,11 +165,12 @@ def backward(heads, head_grads=None, retain_graph=False):
output_handles.append(arr.handle)
if head_grads is None:
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
len(output_handles),
c_array(NDArrayHandle, output_handles),
ctypes.c_void_p(0),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
return
ograd_handles = []
@@ -145,8 +182,9 @@ def backward(heads, head_grads=None, retain_graph=False):
assert len(ograd_handles) == len(output_handles), \
"heads and head_grads must have the same length"
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
len(output_handles),
c_array(NDArrayHandle, output_handles),
c_array(NDArrayHandle, ograd_handles),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 6d53752..ddaeb6e 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -18,14 +18,14 @@ __all__ = ['MXNetError']
#----------------------------
if sys.version_info[0] == 3:
string_types = str,
- numeric_types = (float, int, np.float32, np.int32)
+ numeric_types = (float, int, np.generic)
integer_types = int
# this function is needed for python3
# to convert ctypes.char_p .value back to python str
py_str = lambda x: x.decode('utf-8')
else:
string_types = basestring,
- numeric_types = (float, int, long, np.float32, np.int32)
+ numeric_types = (float, int, long, np.generic)
integer_types = (int, long)
py_str = lambda x: x
diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py
index e56361e..9074e45 100644
--- a/python/mxnet/contrib/autograd.py
+++ b/python/mxnet/contrib/autograd.py
@@ -28,6 +28,8 @@ def set_is_training(is_train):
prev = ctypes.c_int()
check_call(_LIB.MXAutogradSetIsTraining(
ctypes.c_int(is_train), ctypes.byref(prev)))
+ check_call(_LIB.MXAutogradSetIsRecording(
+ ctypes.c_int(is_train), ctypes.byref(prev)))
return bool(prev.value)
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 6b9aab2..d2b108c 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -5,7 +5,6 @@ from __future__ import absolute_import
import ctypes
import copy
-import warnings
import numpy as np
from .base import _LIB
from .base import mx_uint, NDArrayHandle, ExecutorHandle
@@ -61,7 +60,6 @@ class Executor(object):
self._aux_dict = None
self._output_dict = None
self._monitor_callback = None
- self._output_dirty = False
self._ctx = copy.deepcopy(ctx)
self._grad_req = copy.deepcopy(grad_req)
self._group2ctx = copy.deepcopy(group2ctx)
@@ -99,8 +97,7 @@ class Executor(object):
----------
is_train: bool, optional
Whether this forward is for evaluation purpose. If True,
- a backward call is expected to follow. Otherwise following
- backward is invalid.
+ a backward call is expected to follow.
**kwargs
Additional specification of input arguments.
@@ -132,15 +129,9 @@ class Executor(object):
self.handle,
ctypes.c_int(int(is_train))))
- if self._output_dirty:
- warnings.warn(
- "Calling forward the second time after forward(is_train=True) "
- "without calling backward first. Is this intended?",
stacklevel=2)
- self._output_dirty = is_train
-
return self.outputs
- def backward(self, out_grads=None):
+ def backward(self, out_grads=None, is_train=True):
"""Do backward pass to get the gradient of arguments.
Parameters
@@ -149,6 +140,11 @@ class Executor(object):
Gradient on the outputs to be propagated back.
This parameter is only needed when bind is called
on outputs that are not a loss function.
+ is_train : bool, default True
+ Whether this backward is for training or inference. Note that in
rare
+ cases you want to call backward with is_train=False to get gradient
+ during inference.
+
Examples
--------
@@ -211,16 +207,11 @@ class Executor(object):
if not isinstance(obj, NDArray):
raise TypeError("inputs must be NDArray")
ndarray = c_array(NDArrayHandle, [item.handle for item in out_grads])
- check_call(_LIB.MXExecutorBackward(
+ check_call(_LIB.MXExecutorBackwardEx(
self.handle,
mx_uint(len(out_grads)),
- ndarray))
-
- if not self._output_dirty:
- warnings.warn(
- "Calling backward without calling forward(is_train=True) "
- "first. Behavior is undefined.", stacklevel=2)
- self._output_dirty = False
+ ndarray,
+ ctypes.c_int(is_train)))
def set_monitor_callback(self, callback):
"""Install callback for monitor.
diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py
b/python/mxnet/gluon/model_zoo/vision/__init__.py
index e0498dc..56e46f9 100644
--- a/python/mxnet/gluon/model_zoo/vision/__init__.py
+++ b/python/mxnet/gluon/model_zoo/vision/__init__.py
@@ -102,5 +102,8 @@ def get_model(name, **kwargs):
'inceptionv3': inception_v3,
}
name = name.lower()
- assert name in models, 'Model %s is not supported'%name
+ if name not in models:
+ raise ValueError(
+ 'Model %s is not supported. Available options are\n\t%s'%(
+ name, '\n\t'.join(sorted(models.keys()))))
return models[name](**kwargs)
diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index d5e5c1e..0a16ac3 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -756,8 +756,9 @@ class ImageDetIter(ImageIter):
assert i < batch_size, 'Batch size must be multiples of
augmenter output length'
batch_data[i][:] = self.postprocess_data(datum)
num_object = label.shape[0]
- batch_label[i][0:num_object][:] = nd.array(label)
- batch_label[i][num_object:][:] = -1
+ batch_label[i][0:num_object] = nd.array(label)
+ if num_object < batch_label[i].shape[0]:
+ batch_label[i][num_object:] = -1
i += 1
except StopIteration:
if not i:
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index fdecebb..b2178a9 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -124,6 +124,7 @@ fixed-size items.
"""
__slots__ = []
+ # make numpy functions return NDArray instead of numpy object array
__array_priority__ = 1000.0
# pylint: disable= no-member, undefined-variable
def __repr__(self):
@@ -1058,22 +1059,30 @@ fixed-size items.
check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl)))
return NDArray(hdl)
- def backward(self, out_grad=None, retain_graph=False):
+ def backward(self, out_grad=None, retain_graph=False, is_train=True):
"""Compute the gradients of this NDArray w.r.t variables.
Parameters
----------
- out_grad: list of NDArray or None
+ out_grad : NDArray, optional
+ Gradient with respect to head.
+ retain_graph : bool, optional
+        Whether to retain the computation graph for another backward
+        pass on the same graph. By default the computation history
+        is cleared.
+ is_train : bool, optional
+ Whether to compute gradient for training or inference.
"""
if out_grad is None:
ograd_handles = [NDArrayHandle(0)]
else:
ograd_handles = [out_grad.handle]
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
1, c_array(NDArrayHandle, [self.handle]),
c_array(NDArrayHandle, ograd_handles),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
def onehot_encode(indices, out):
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index ca49402..3ba3154 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -35,6 +35,13 @@ int MXExecutorForward(ExecutorHandle handle, int is_train) {
int MXExecutorBackward(ExecutorHandle handle,
mx_uint len,
NDArrayHandle *head_grads) {
+ return MXExecutorBackwardEx(handle, len, head_grads, true);
+}
+
+int MXExecutorBackwardEx(ExecutorHandle handle,
+ mx_uint len,
+ NDArrayHandle *head_grads,
+ int is_train) {
API_BEGIN();
Executor *exec = static_cast<Executor*>(handle);
std::vector<NDArray> ndarrays;
@@ -42,7 +49,7 @@ int MXExecutorBackward(ExecutorHandle handle,
for (mx_uint i = 0; i < len; ++i) {
ndarrays.push_back(*args_ptr[i]);
}
- exec->Backward(ndarrays);
+ exec->Backward(ndarrays, is_train);
API_END();
}
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 818f263..f401394 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -378,7 +378,7 @@ void ImperativeInvokeImpl(const Context& default_ctx,
}
if (fn) {
- if (AutogradRuntime::Get()->IsTraining()) {
+ if (AutogradRuntime::Get()->IsRecording()) {
AutogradRuntime::Get()->RecordImperativeFCompute(op,
attrs, &ndinputs, &ndoutputs);
}
@@ -387,7 +387,7 @@ void ImperativeInvokeImpl(const Context& default_ctx,
} else if (createop.count(op)) {
auto state =
createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types);
- if (AutogradRuntime::Get()->IsTraining()) {
+ if (AutogradRuntime::Get()->IsRecording()) {
AutogradRuntime::Get()->RecordImperativeOperator(state, op,
attrs, &ndinputs, &ndoutputs);
}
@@ -528,6 +528,12 @@ int MXAutogradSetIsTraining(int is_training, int* prev) {
API_END();
}
+int MXAutogradSetIsRecording(int is_recording, int* prev) {
+ API_BEGIN();
+ *prev =
AutogradRuntime::Get()->SetIsRecording(static_cast<bool>(is_recording));
+ API_END();
+}
+
int MXAutogradMarkVariables(mx_uint num_var,
NDArrayHandle *var_handles,
mx_uint *reqs_array,
@@ -556,6 +562,14 @@ int MXAutogradBackward(mx_uint num_output,
NDArrayHandle *output_handles,
NDArrayHandle *ograd_handles,
int retain_graph) {
+ return MXAutogradBackwardEx(num_output, output_handles, ograd_handles,
retain_graph, true);
+}
+
+int MXAutogradBackwardEx(mx_uint num_output,
+ NDArrayHandle *output_handles,
+ NDArrayHandle *ograd_handles,
+ int retain_graph,
+ int is_train) {
API_BEGIN();
MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
@@ -574,6 +588,6 @@ int MXAutogradBackward(mx_uint num_output,
}
}
- AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph);
+ AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph,
is_train);
API_END();
}
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index af5ec7f..a17f44a 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -42,7 +42,7 @@ void GraphExecutor::PartialForward(bool is_train, int step,
int *step_left) {
*step_left = static_cast<int>(num_forward_nodes_ - sstep - 1);
}
-void GraphExecutor::Backward(const std::vector<NDArray>& head_grads) {
+void GraphExecutor::Backward(const std::vector<NDArray>& head_grads, bool
is_train) {
const auto& idx = graph_.indexed_graph();
if (num_forward_inputs_ != idx.input_nodes().size()) {
for (size_t i = 0; i < head_grad_array_.size(); ++i) {
@@ -57,7 +57,7 @@ void GraphExecutor::Backward(const std::vector<NDArray>&
head_grads) {
}
}
}
- RunOps(true, num_forward_nodes_, idx.num_nodes());
+ RunOps(is_train, num_forward_nodes_, idx.num_nodes());
}
void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*)
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index 5b6fa39..0efb8ae 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -44,7 +44,7 @@ class GraphExecutor : public Executor {
virtual ~GraphExecutor();
void Forward(bool is_train) override;
void PartialForward(bool is_train, int step, int *step_left) override;
- void Backward(const std::vector<NDArray> &head_grads) override;
+ void Backward(const std::vector<NDArray> &head_grads, bool is_train = true)
override;
const std::vector<NDArray>& outputs() const override;
const std::unordered_map<std::string, NDArray>& in_arg_map() const override;
const std::unordered_map<std::string, NDArray>& arg_grad_map() const
override;
diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc
index f990ee2..efb6bc9 100644
--- a/src/ndarray/autograd.cc
+++ b/src/ndarray/autograd.cc
@@ -23,9 +23,11 @@ using nnvm::NodeEntryMap;
using exec::GraphExecutor;
#if DMLC_CXX11_THREAD_LOCAL
-thread_local bool AutogradRuntime::is_train_;
+thread_local bool AutogradRuntime::is_train_ = false;
+thread_local bool AutogradRuntime::is_recording_ = false;
#else
-MX_THREAD_LOCAL bool AutogradRuntime::is_train_;
+MX_THREAD_LOCAL bool AutogradRuntime::is_train_ = false;
+MX_THREAD_LOCAL bool AutogradRuntime::is_recording_ = false;
#endif
template<typename FVisit>
@@ -149,7 +151,7 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op,
void AutogradRuntime::ComputeGradient(const std::vector<NDArray>& outputs,
const std::vector<NDArray>& ograds,
- bool retain_graph) {
+ bool retain_graph, bool is_train) {
static auto& fmutate_inputs =
nnvm::Op::GetAttr<nnvm::FMutateInputs>("FMutateInputs");
std::vector<AGNodeEntry> heads;
Symbol sym;
@@ -233,7 +235,7 @@ void AutogradRuntime::ComputeGradient(const
std::vector<NDArray>& outputs,
}
}
- exec->Backward(head_grads);
+ exec->Backward(head_grads, is_train);
delete exec;
}
diff --git a/src/ndarray/autograd.h b/src/ndarray/autograd.h
index baf843d..4748640 100644
--- a/src/ndarray/autograd.h
+++ b/src/ndarray/autograd.h
@@ -63,6 +63,16 @@ class AutogradRuntime {
bool IsTraining() const {
return is_train_;
}
+ /*! \brief turn on or turn off operator recording for autograd. */
+ bool SetIsRecording(bool is_recording) {
+ bool old = is_recording_;
+ is_recording_ = is_recording;
+ return old;
+ }
+ /*! \brief whether operator recording is on. */
+ bool IsRecording() const {
+ return is_recording_;
+ }
/*! \brief mark variables for computing gradients. */
void MarkVariables(const std::vector<NDArray*>& variables,
const std::vector<mx_uint>& grad_reqs,
@@ -81,7 +91,7 @@ class AutogradRuntime {
/*! \brief compute the gradient of outputs w.r.t variables. */
void ComputeGradient(const std::vector<NDArray>& outputs,
const std::vector<NDArray>& ograds,
- bool retain_graph);
+ bool retain_graph, bool is_train);
/*! \return AutogradRuntime singleton */
static AutogradRuntime* Get();
/*! \brief Get shared pointer reference to AutogradRuntime singleton.
@@ -109,8 +119,10 @@ class AutogradRuntime {
/*! \brief indicate whether is training. */
#if DMLC_CXX11_THREAD_LOCAL
static thread_local bool is_train_;
+ static thread_local bool is_recording_;
#else
static MX_THREAD_LOCAL bool is_train_;
+ static MX_THREAD_LOCAL bool is_recording_;
#endif
/*! \brief node count used for naming */
std::atomic<uint64_t> node_count_{0};
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index f2e90dd..48499fa 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -69,6 +69,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const {
using namespace autograd;
NDArray ret = *this;
CHECK(!is_none()) << "NDArray is not initialized";
+ CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end <<
")";
CHECK_GE(shape_[0], end) << "Slice end index out of range";
size_t length = shape_.ProdShape(1, shape_.ndim());
MSHADOW_TYPE_SWITCH(ret.dtype(), DType, {
diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h
index e77d613..57d7814 100644
--- a/src/operator/dropout-inl.h
+++ b/src/operator/dropout-inl.h
@@ -29,6 +29,7 @@ namespace dropout {
enum DropoutOpInputs {kData};
enum DropoutOpOutputs {kOut, kMask};
enum DropoutOpForwardResource {kRandom};
+enum DropoutOpMode {kTraining, kAlways};
} // namespace dropout
namespace mxnet {
@@ -58,10 +59,16 @@ static void bernoulli_generate(int n, double p, int* r) {
struct DropoutParam : public dmlc::Parameter<DropoutParam> {
float p;
+ int mode;
DMLC_DECLARE_PARAMETER(DropoutParam) {
DMLC_DECLARE_FIELD(p).set_default(0.5)
.set_range(0, 1)
.describe("Fraction of the input that gets dropped out during training
time.");
+ DMLC_DECLARE_FIELD(mode)
+ .add_enum("training", dropout::kTraining)
+ .add_enum("always", dropout::kAlways)
+ .set_default(dropout::kTraining)
+ .describe("Whether to only turn on dropout during training or to also turn
on for inference.");
}
}; // struct DropoutParam
@@ -70,6 +77,7 @@ class DropoutOp : public Operator {
public:
explicit DropoutOp(DropoutParam param) {
this->pkeep_ = 1.0f - param.p;
+ this->mode_ = param.mode;
}
virtual void Forward(const OpContext &ctx,
@@ -86,7 +94,7 @@ class DropoutOp : public Operator {
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu,
DType>(s);
- if (ctx.is_train) {
+ if (ctx.is_train || mode_ == dropout::kAlways) {
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
DType* outptr = out.dptr_;
@@ -96,7 +104,7 @@ class DropoutOp : public Operator {
bernoulli_generate(count, this->pkeep_, maskptr);
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
- outptr[i] = dataptr[i] * maskptr[i];
+ outptr[i] = dataptr[i] * maskptr[i] * (1.0f / pkeep_);
}
#else
Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu,
real_t>(s);
@@ -124,6 +132,7 @@ class DropoutOp : public Operator {
Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu,
DType>(s);
+ if (ctx.is_train || mode_ == dropout::kAlways) {
#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
DType* ingradptr = gdata.dptr_;
DType* outgradptr = grad.dptr_;
@@ -131,17 +140,21 @@ class DropoutOp : public Operator {
int count = mask.shape_[0]*mask.shape_[1];
- #pragma omp parallel for
+ #pragma omp parallel for
for (int i = 0; i < count; ++i) {
- ingradptr[i] = outgradptr[i] * maskptr[i];
+ ingradptr[i] = outgradptr[i] * maskptr[i] * (1.0f / pkeep_);
}
#else // USE_MKL && _OPENMP
Assign(gdata, req[dropout::kData], grad * mask);
#endif // USE_MKL && _OPENMP
+ } else {
+ Assign(gdata, req[dropout::kData], F<mshadow_op::identity>(grad));
+ }
}
private:
real_t pkeep_;
+ int mode_;
}; // class DropoutOp
diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc
index 74a50ba..e206214 100644
--- a/src/operator/dropout.cc
+++ b/src/operator/dropout.cc
@@ -33,7 +33,8 @@ MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp)
The whole array is rescaled by :math:`1/(1-p)` to keep the expected
sum of the input unchanged.
-- During testing, this operator does not change the input.
+- During testing, this operator does not change the input if mode is
'training'.
If mode is 'always', the same computation as during training will be applied.
Example::
diff --git a/tests/python/unittest/test_autograd.py
b/tests/python/unittest/test_autograd.py
index 8dea04d..172075d 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -248,6 +248,22 @@ def test_attach_grad():
assert (x.grad.asnumpy() == 2).all()
+def test_is_train():
+ x = mx.nd.ones((10, 10))
+ x.attach_grad()
+ with record(True):
+ y = mx.nd.Dropout(x, p=0.5)
+ assert y.asnumpy().max() == 2 and y.asnumpy().min() == 0
+ y.backward()
+ assert (x.grad.asnumpy() == y.asnumpy()).all()
+
+ with record(False):
+ y = mx.nd.Dropout(x, p=0.5)
+ assert (y.asnumpy() == x.asnumpy()).all()
+ y.backward(is_train=False)
+ assert (x.grad.asnumpy() == x.asnumpy()).all()
+
+
if __name__ == "__main__":
import nose
nose.runmodule()
diff --git a/tests/python/unittest/test_operator.py
b/tests/python/unittest/test_operator.py
index 2f7c3b9..51a77e0 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -3658,6 +3658,42 @@ def test_stack():
check_numeric_gradient(out, inputs)
+def test_dropout():
+ # test dropout
+ x = mx.sym.var('data')
+ y = mx.sym.Dropout(x, p=0.5)
+ exe = y.simple_bind(ctx=default_context(), data=(10, 10))
+
+ exe.arg_arrays[0][:] = 1
+ exe.forward(is_train=True)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))])
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
+ exe.forward(is_train=False)
+ assert (exe.outputs[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all()
+ exe.backward([mx.nd.ones((10, 10))], is_train=False)
+ assert (exe.grad_arrays[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all()
+
+ # test permanent dropout
+ x = mx.sym.var('data')
+ y = mx.sym.Dropout(x, p=0.5, mode='always')
+ exe = y.simple_bind(ctx=default_context(), data=(10, 10))
+
+ exe.arg_arrays[0][:] = 1
+ exe.forward(is_train=True)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))])
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
+ exe.forward(is_train=False)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))], is_train=False)
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
if __name__ == '__main__':
import nose
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].