This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new e6a1139 add backward(is_train=False) and always mode for dropout
(#7303)
e6a1139 is described below
commit e6a1139812db33bd11fcc8491915f6a51d42418b
Author: Eric Junyuan Xie <[email protected]>
AuthorDate: Thu Aug 3 10:36:11 2017 -0700
add backward(is_train=False) and always mode for dropout (#7303)
* add backward(is_train=False) and always mode for dropout
* fix
* fix
* fix slicing
* fix mkl dropout
---
include/mxnet/c_api.h | 36 ++++++++++++-
include/mxnet/executor.h | 2 +-
python/mxnet/autograd.py | 70 +++++++++++++++++++------
python/mxnet/base.py | 4 +-
python/mxnet/contrib/autograd.py | 2 +
python/mxnet/executor.py | 29 ++++------
python/mxnet/gluon/model_zoo/vision/__init__.py | 5 +-
python/mxnet/image/detection.py | 5 +-
python/mxnet/ndarray.py | 17 ++++--
src/c_api/c_api_executor.cc | 9 +++-
src/c_api/c_api_ndarray.cc | 20 +++++--
src/executor/graph_executor.cc | 4 +-
src/executor/graph_executor.h | 2 +-
src/ndarray/autograd.cc | 10 ++--
src/ndarray/autograd.h | 14 ++++-
src/ndarray/ndarray.cc | 1 +
src/operator/dropout-inl.h | 21 ++++++--
src/operator/dropout.cc | 3 +-
tests/python/unittest/test_autograd.py | 16 ++++++
tests/python/unittest/test_operator.py | 36 +++++++++++++
20 files changed, 243 insertions(+), 63 deletions(-)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 47447fb..d9a5315 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -553,6 +553,13 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator
creator,
const char **param_vals);
/*!
* \brief set whether to record operator for autograd
+ * \param is_recording 1 when recording, 0 when not recording.
+ * \param prev returns the previous status before this set.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXAutogradSetIsRecording(int is_recording, int* prev);
+/*!
+ * \brief set whether to record operator for autograd
* \param is_train 1 when training, 0 when testing
* \param prev returns the previous status before this set.
* \return 0 when success, -1 when failure happens
@@ -589,6 +596,20 @@ MXNET_DLL int MXAutogradBackward(mx_uint num_output,
NDArrayHandle* ograd_handles,
int retain_graph);
/*!
* \brief compute the gradient of outputs w.r.t variables
+* \param num_output number of output NDArray
+* \param output_handles output NDArrays
+* \param ograd_handles head gradient for NDArrays
+* \param retain_graph whether to keep the graph after backward
+* \param is_train whether to do backward for training or inference
+* \return 0 when success, -1 when failure happens
+*/
+MXNET_DLL int MXAutogradBackwardEx(mx_uint num_output,
+ NDArrayHandle* output_handles,
+ NDArrayHandle* ograd_handles,
+ int retain_graph,
+ int is_train);
+/*!
* \brief create cached operator
*/
MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
@@ -1028,7 +1049,20 @@ MXNET_DLL int MXExecutorForward(ExecutorHandle handle,
int is_train);
MXNET_DLL int MXExecutorBackward(ExecutorHandle handle,
mx_uint len,
NDArrayHandle *head_grads);
-
+/*!
+ * \brief Executor run backward
+ *
+ * \param handle execute handle
+ * \param len length
+ * \param head_grads NDArray handle for heads' gradient
+ * \param is_train int value to indicate whether the backward pass is for
training or inference
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXExecutorBackwardEx(ExecutorHandle handle,
+ mx_uint len,
+ NDArrayHandle *head_grads,
+ int is_train);
/*!
* \brief Get executor's head NDArray
*
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index 40bd60f..9308587 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -58,7 +58,7 @@ class Executor {
*
* \param head_grads the gradient of head nodes to be backproped.
*/
- virtual void Backward(const std::vector<NDArray> &head_grads) = 0;
+ virtual void Backward(const std::vector<NDArray> &head_grads, bool is_train
= true) = 0;
/*!
* \brief print the execution plan info to output stream.
* \param os the output stream we like to print to.
diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py
index b97d350..2f33052 100644
--- a/python/mxnet/autograd.py
+++ b/python/mxnet/autograd.py
@@ -12,9 +12,7 @@ from .symbol import _GRAD_REQ_MAP
def set_recording(is_recording):
"""Set status to recording/not recording. When recording, graph will be
constructed
- for gradient computation. Operators will also run with ctx.is_train=True.
For example,
- Dropout will drop inputs randomly when is_train=True while simply passing
through
- if is_train=False.
+ for gradient computation.
Parameters
----------
@@ -25,46 +23,77 @@ def set_recording(is_recording):
previous state before this set.
"""
prev = ctypes.c_int()
- check_call(_LIB.MXAutogradSetIsTraining(
+ check_call(_LIB.MXAutogradSetIsRecording(
ctypes.c_int(is_recording), ctypes.byref(prev)))
return bool(prev.value)
+def set_training(is_train):
+ """Set status to training/not training. This affects ctx.is_train in
operator
+ running context. For example, Dropout will drop inputs randomly when
+ is_train=True while simply passing through if is_train=False.
+
+ Parameters
+ ----------
+ is_train: bool
+
+ Returns
+ -------
+ previous state before this set.
+ """
+ prev = ctypes.c_int()
+ check_call(_LIB.MXAutogradSetIsTraining(
+ ctypes.c_int(is_train), ctypes.byref(prev)))
+ return bool(prev.value)
+
-class TrainingStateScope(object):
+class RecordingStateScope(object):
"""Scope for managing training state.
Example::
- with TrainingStateScope(True):
+ with RecordingStateScope(True, True):
y = model(x)
backward([y])
"""
- def __init__(self, enter_state):
+ def __init__(self, enter_state, is_train):
self._enter_state = enter_state
+ self._enter_is_train = is_train
self._prev = None
+ self._prev_is_train = None
def __enter__(self):
self._prev = set_recording(self._enter_state)
+ self._prev_is_train = set_training(self._enter_is_train)
def __exit__(self, ptype, value, trace):
if self._prev != self._enter_state:
set_recording(self._prev)
+ if self._prev_is_train != self._enter_is_train:
+ set_training(self._prev_is_train)
-def record():
+def record(is_train=True):
"""Returns a training scope context to be used in 'with' statement
and captures training code.
+ .. note:: When forwarding with is_train=False, the corresponding backward
+ should also use is_train=False, otherwise gradient is undefined.
+
Example::
with autograd.record():
y = model(x)
backward([y])
metric.update(...)
optim.step(...)
+
+ Parameters
+ ----------
+ is_train: bool, default True
+ Whether to do forward for training or inference.
"""
- return TrainingStateScope(True)
+ return RecordingStateScope(True, is_train)
-def pause():
+def pause(is_train=False):
"""Returns a testing scope context to be used in 'with' statement
and captures testing code.
@@ -74,8 +103,13 @@ def pause():
backward([y])
with autograd.pause():
# testing, IO, gradient updates...
+
+ Parameters
+ ----------
+ is_train: bool, default False
+ Whether to do forward for training or inference.
"""
- return TrainingStateScope(False)
+ return RecordingStateScope(False, is_train)
def mark_variables(variables, gradients, grad_reqs='write'):
@@ -109,7 +143,7 @@ def mark_variables(variables, gradients, grad_reqs='write'):
c_array(NDArrayHandle, gradient_handles)))
-def backward(heads, head_grads=None, retain_graph=False):
+def backward(heads, head_grads=None, retain_graph=False, is_train=True):
"""Compute the gradients of heads w.r.t previously marked variables.
Parameters
@@ -118,6 +152,8 @@ def backward(heads, head_grads=None, retain_graph=False):
Output NDArray(s)
head_grads: NDArray or list of NDArray or None
Gradients with respect to heads.
+ is_train: bool, optional
+ Whether to do backward for training or inference.
"""
if isinstance(heads, NDArray):
assert head_grads is None or isinstance(head_grads, NDArray)
@@ -129,11 +165,12 @@ def backward(heads, head_grads=None, retain_graph=False):
output_handles.append(arr.handle)
if head_grads is None:
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
len(output_handles),
c_array(NDArrayHandle, output_handles),
ctypes.c_void_p(0),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
return
ograd_handles = []
@@ -145,8 +182,9 @@ def backward(heads, head_grads=None, retain_graph=False):
assert len(ograd_handles) == len(output_handles), \
"heads and head_grads must have the same length"
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
len(output_handles),
c_array(NDArrayHandle, output_handles),
c_array(NDArrayHandle, ograd_handles),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 6d53752..ddaeb6e 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -18,14 +18,14 @@ __all__ = ['MXNetError']
#----------------------------
if sys.version_info[0] == 3:
string_types = str,
- numeric_types = (float, int, np.float32, np.int32)
+ numeric_types = (float, int, np.generic)
integer_types = int
# this function is needed for python3
# to convert ctypes.char_p .value back to python str
py_str = lambda x: x.decode('utf-8')
else:
string_types = basestring,
- numeric_types = (float, int, long, np.float32, np.int32)
+ numeric_types = (float, int, long, np.generic)
integer_types = (int, long)
py_str = lambda x: x
diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py
index e56361e..9074e45 100644
--- a/python/mxnet/contrib/autograd.py
+++ b/python/mxnet/contrib/autograd.py
@@ -28,6 +28,8 @@ def set_is_training(is_train):
prev = ctypes.c_int()
check_call(_LIB.MXAutogradSetIsTraining(
ctypes.c_int(is_train), ctypes.byref(prev)))
+ check_call(_LIB.MXAutogradSetIsRecording(
+ ctypes.c_int(is_train), ctypes.byref(prev)))
return bool(prev.value)
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 6b9aab2..d2b108c 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -5,7 +5,6 @@ from __future__ import absolute_import
import ctypes
import copy
-import warnings
import numpy as np
from .base import _LIB
from .base import mx_uint, NDArrayHandle, ExecutorHandle
@@ -61,7 +60,6 @@ class Executor(object):
self._aux_dict = None
self._output_dict = None
self._monitor_callback = None
- self._output_dirty = False
self._ctx = copy.deepcopy(ctx)
self._grad_req = copy.deepcopy(grad_req)
self._group2ctx = copy.deepcopy(group2ctx)
@@ -99,8 +97,7 @@ class Executor(object):
----------
is_train: bool, optional
Whether this forward is for evaluation purpose. If True,
- a backward call is expected to follow. Otherwise following
- backward is invalid.
+ a backward call is expected to follow.
**kwargs
Additional specification of input arguments.
@@ -132,15 +129,9 @@ class Executor(object):
self.handle,
ctypes.c_int(int(is_train))))
- if self._output_dirty:
- warnings.warn(
- "Calling forward the second time after forward(is_train=True) "
- "without calling backward first. Is this intended?",
stacklevel=2)
- self._output_dirty = is_train
-
return self.outputs
- def backward(self, out_grads=None):
+ def backward(self, out_grads=None, is_train=True):
"""Do backward pass to get the gradient of arguments.
Parameters
@@ -149,6 +140,11 @@ class Executor(object):
Gradient on the outputs to be propagated back.
This parameter is only needed when bind is called
on outputs that are not a loss function.
+ is_train : bool, default True
+ Whether this backward is for training or inference. Note that in
rare
+ cases you want to call backward with is_train=False to get gradient
+ during inference.
+
Examples
--------
@@ -211,16 +207,11 @@ class Executor(object):
if not isinstance(obj, NDArray):
raise TypeError("inputs must be NDArray")
ndarray = c_array(NDArrayHandle, [item.handle for item in out_grads])
- check_call(_LIB.MXExecutorBackward(
+ check_call(_LIB.MXExecutorBackwardEx(
self.handle,
mx_uint(len(out_grads)),
- ndarray))
-
- if not self._output_dirty:
- warnings.warn(
- "Calling backward without calling forward(is_train=True) "
- "first. Behavior is undefined.", stacklevel=2)
- self._output_dirty = False
+ ndarray,
+ ctypes.c_int(is_train)))
def set_monitor_callback(self, callback):
"""Install callback for monitor.
diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py
b/python/mxnet/gluon/model_zoo/vision/__init__.py
index e0498dc..56e46f9 100644
--- a/python/mxnet/gluon/model_zoo/vision/__init__.py
+++ b/python/mxnet/gluon/model_zoo/vision/__init__.py
@@ -102,5 +102,8 @@ def get_model(name, **kwargs):
'inceptionv3': inception_v3,
}
name = name.lower()
- assert name in models, 'Model %s is not supported'%name
+ if name not in models:
+ raise ValueError(
+ 'Model %s is not supported. Available options are\n\t%s'%(
+ name, '\n\t'.join(sorted(models.keys()))))
return models[name](**kwargs)
diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py
index d5e5c1e..0a16ac3 100644
--- a/python/mxnet/image/detection.py
+++ b/python/mxnet/image/detection.py
@@ -756,8 +756,9 @@ class ImageDetIter(ImageIter):
assert i < batch_size, 'Batch size must be multiples of
augmenter output length'
batch_data[i][:] = self.postprocess_data(datum)
num_object = label.shape[0]
- batch_label[i][0:num_object][:] = nd.array(label)
- batch_label[i][num_object:][:] = -1
+ batch_label[i][0:num_object] = nd.array(label)
+ if num_object < batch_label[i].shape[0]:
+ batch_label[i][num_object:] = -1
i += 1
except StopIteration:
if not i:
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index fdecebb..b2178a9 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -124,6 +124,7 @@ fixed-size items.
"""
__slots__ = []
+ # make numpy functions return NDArray instead of numpy object array
__array_priority__ = 1000.0
# pylint: disable= no-member, undefined-variable
def __repr__(self):
@@ -1058,22 +1059,30 @@ fixed-size items.
check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl)))
return NDArray(hdl)
- def backward(self, out_grad=None, retain_graph=False):
+ def backward(self, out_grad=None, retain_graph=False, is_train=True):
"""Compute the gradients of this NDArray w.r.t variables.
Parameters
----------
- out_grad: list of NDArray or None
+ out_grad : NDArray, optional
+ Gradient with respect to head.
+ retain_graph : bool, optional
+        Whether to retain the computation graph for another backward
+        pass on the same graph. By default the computation history
+        is cleared.
+ is_train : bool, optional
+ Whether to compute gradient for training or inference.
"""
if out_grad is None:
ograd_handles = [NDArrayHandle(0)]
else:
ograd_handles = [out_grad.handle]
- check_call(_LIB.MXAutogradBackward(
+ check_call(_LIB.MXAutogradBackwardEx(
1, c_array(NDArrayHandle, [self.handle]),
c_array(NDArrayHandle, ograd_handles),
- ctypes.c_int(retain_graph)))
+ ctypes.c_int(retain_graph),
+ ctypes.c_int(is_train)))
def onehot_encode(indices, out):
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index ca49402..3ba3154 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -35,6 +35,13 @@ int MXExecutorForward(ExecutorHandle handle, int is_train) {
int MXExecutorBackward(ExecutorHandle handle,
mx_uint len,
NDArrayHandle *head_grads) {
+ return MXExecutorBackwardEx(handle, len, head_grads, true);
+}
+
+int MXExecutorBackwardEx(ExecutorHandle handle,
+ mx_uint len,
+ NDArrayHandle *head_grads,
+ int is_train) {
API_BEGIN();
Executor *exec = static_cast<Executor*>(handle);
std::vector<NDArray> ndarrays;
@@ -42,7 +49,7 @@ int MXExecutorBackward(ExecutorHandle handle,
for (mx_uint i = 0; i < len; ++i) {
ndarrays.push_back(*args_ptr[i]);
}
- exec->Backward(ndarrays);
+ exec->Backward(ndarrays, is_train);
API_END();
}
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 818f263..f401394 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -378,7 +378,7 @@ void ImperativeInvokeImpl(const Context& default_ctx,
}
if (fn) {
- if (AutogradRuntime::Get()->IsTraining()) {
+ if (AutogradRuntime::Get()->IsRecording()) {
AutogradRuntime::Get()->RecordImperativeFCompute(op,
attrs, &ndinputs, &ndoutputs);
}
@@ -387,7 +387,7 @@ void ImperativeInvokeImpl(const Context& default_ctx,
} else if (createop.count(op)) {
auto state =
createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types);
- if (AutogradRuntime::Get()->IsTraining()) {
+ if (AutogradRuntime::Get()->IsRecording()) {
AutogradRuntime::Get()->RecordImperativeOperator(state, op,
attrs, &ndinputs, &ndoutputs);
}
@@ -528,6 +528,12 @@ int MXAutogradSetIsTraining(int is_training, int* prev) {
API_END();
}
+int MXAutogradSetIsRecording(int is_recording, int* prev) {
+ API_BEGIN();
+ *prev =
AutogradRuntime::Get()->SetIsRecording(static_cast<bool>(is_recording));
+ API_END();
+}
+
int MXAutogradMarkVariables(mx_uint num_var,
NDArrayHandle *var_handles,
mx_uint *reqs_array,
@@ -556,6 +562,14 @@ int MXAutogradBackward(mx_uint num_output,
NDArrayHandle *output_handles,
NDArrayHandle *ograd_handles,
int retain_graph) {
+ return MXAutogradBackwardEx(num_output, output_handles, ograd_handles,
retain_graph, true);
+}
+
+int MXAutogradBackwardEx(mx_uint num_output,
+ NDArrayHandle *output_handles,
+ NDArrayHandle *ograd_handles,
+ int retain_graph,
+ int is_train) {
API_BEGIN();
MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
@@ -574,6 +588,6 @@ int MXAutogradBackward(mx_uint num_output,
}
}
- AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph);
+ AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph,
is_train);
API_END();
}
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index af5ec7f..a17f44a 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -42,7 +42,7 @@ void GraphExecutor::PartialForward(bool is_train, int step,
int *step_left) {
*step_left = static_cast<int>(num_forward_nodes_ - sstep - 1);
}
-void GraphExecutor::Backward(const std::vector<NDArray>& head_grads) {
+void GraphExecutor::Backward(const std::vector<NDArray>& head_grads, bool
is_train) {
const auto& idx = graph_.indexed_graph();
if (num_forward_inputs_ != idx.input_nodes().size()) {
for (size_t i = 0; i < head_grad_array_.size(); ++i) {
@@ -57,7 +57,7 @@ void GraphExecutor::Backward(const std::vector<NDArray>&
head_grads) {
}
}
}
- RunOps(true, num_forward_nodes_, idx.num_nodes());
+ RunOps(is_train, num_forward_nodes_, idx.num_nodes());
}
void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*)
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h
index 5b6fa39..0efb8ae 100644
--- a/src/executor/graph_executor.h
+++ b/src/executor/graph_executor.h
@@ -44,7 +44,7 @@ class GraphExecutor : public Executor {
virtual ~GraphExecutor();
void Forward(bool is_train) override;
void PartialForward(bool is_train, int step, int *step_left) override;
- void Backward(const std::vector<NDArray> &head_grads) override;
+ void Backward(const std::vector<NDArray> &head_grads, bool is_train = true)
override;
const std::vector<NDArray>& outputs() const override;
const std::unordered_map<std::string, NDArray>& in_arg_map() const override;
const std::unordered_map<std::string, NDArray>& arg_grad_map() const
override;
diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc
index f990ee2..efb6bc9 100644
--- a/src/ndarray/autograd.cc
+++ b/src/ndarray/autograd.cc
@@ -23,9 +23,11 @@ using nnvm::NodeEntryMap;
using exec::GraphExecutor;
#if DMLC_CXX11_THREAD_LOCAL
-thread_local bool AutogradRuntime::is_train_;
+thread_local bool AutogradRuntime::is_train_ = false;
+thread_local bool AutogradRuntime::is_recording_ = false;
#else
-MX_THREAD_LOCAL bool AutogradRuntime::is_train_;
+MX_THREAD_LOCAL bool AutogradRuntime::is_train_ = false;
+MX_THREAD_LOCAL bool AutogradRuntime::is_recording_ = false;
#endif
template<typename FVisit>
@@ -149,7 +151,7 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op,
void AutogradRuntime::ComputeGradient(const std::vector<NDArray>& outputs,
const std::vector<NDArray>& ograds,
- bool retain_graph) {
+ bool retain_graph, bool is_train) {
static auto& fmutate_inputs =
nnvm::Op::GetAttr<nnvm::FMutateInputs>("FMutateInputs");
std::vector<AGNodeEntry> heads;
Symbol sym;
@@ -233,7 +235,7 @@ void AutogradRuntime::ComputeGradient(const
std::vector<NDArray>& outputs,
}
}
- exec->Backward(head_grads);
+ exec->Backward(head_grads, is_train);
delete exec;
}
diff --git a/src/ndarray/autograd.h b/src/ndarray/autograd.h
index baf843d..4748640 100644
--- a/src/ndarray/autograd.h
+++ b/src/ndarray/autograd.h
@@ -63,6 +63,16 @@ class AutogradRuntime {
bool IsTraining() const {
return is_train_;
}
+ /*! \brief turn on or turn off operator recording for autograd. */
+ bool SetIsRecording(bool is_recording) {
+ bool old = is_recording_;
+ is_recording_ = is_recording;
+ return old;
+ }
+ /*! \brief whether operator recording is on. */
+ bool IsRecording() const {
+ return is_recording_;
+ }
/*! \brief mark variables for computing gradients. */
void MarkVariables(const std::vector<NDArray*>& variables,
const std::vector<mx_uint>& grad_reqs,
@@ -81,7 +91,7 @@ class AutogradRuntime {
/*! \brief compute the gradient of outputs w.r.t variables. */
void ComputeGradient(const std::vector<NDArray>& outputs,
const std::vector<NDArray>& ograds,
- bool retain_graph);
+ bool retain_graph, bool is_train);
/*! \return AutogradRuntime singleton */
static AutogradRuntime* Get();
/*! \brief Get shared pointer reference to AutogradRuntime singleton.
@@ -109,8 +119,10 @@ class AutogradRuntime {
/*! \brief indicate whether is training. */
#if DMLC_CXX11_THREAD_LOCAL
static thread_local bool is_train_;
+ static thread_local bool is_recording_;
#else
static MX_THREAD_LOCAL bool is_train_;
+ static MX_THREAD_LOCAL bool is_recording_;
#endif
/*! \brief node count used for naming */
std::atomic<uint64_t> node_count_{0};
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index f2e90dd..48499fa 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -69,6 +69,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const {
using namespace autograd;
NDArray ret = *this;
CHECK(!is_none()) << "NDArray is not initialized";
+ CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end <<
")";
CHECK_GE(shape_[0], end) << "Slice end index out of range";
size_t length = shape_.ProdShape(1, shape_.ndim());
MSHADOW_TYPE_SWITCH(ret.dtype(), DType, {
diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h
index e77d613..57d7814 100644
--- a/src/operator/dropout-inl.h
+++ b/src/operator/dropout-inl.h
@@ -29,6 +29,7 @@ namespace dropout {
enum DropoutOpInputs {kData};
enum DropoutOpOutputs {kOut, kMask};
enum DropoutOpForwardResource {kRandom};
+enum DropoutOpMode {kTraining, kAlways};
} // namespace dropout
namespace mxnet {
@@ -58,10 +59,16 @@ static void bernoulli_generate(int n, double p, int* r) {
struct DropoutParam : public dmlc::Parameter<DropoutParam> {
float p;
+ int mode;
DMLC_DECLARE_PARAMETER(DropoutParam) {
DMLC_DECLARE_FIELD(p).set_default(0.5)
.set_range(0, 1)
.describe("Fraction of the input that gets dropped out during training
time.");
+ DMLC_DECLARE_FIELD(mode)
+ .add_enum("training", dropout::kTraining)
+ .add_enum("always", dropout::kAlways)
+ .set_default(dropout::kTraining)
+ .describe("Whether to only turn on dropout during training or to also turn
on for inference.");
}
}; // struct DropoutParam
@@ -70,6 +77,7 @@ class DropoutOp : public Operator {
public:
explicit DropoutOp(DropoutParam param) {
this->pkeep_ = 1.0f - param.p;
+ this->mode_ = param.mode;
}
virtual void Forward(const OpContext &ctx,
@@ -86,7 +94,7 @@ class DropoutOp : public Operator {
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu,
DType>(s);
- if (ctx.is_train) {
+ if (ctx.is_train || mode_ == dropout::kAlways) {
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
DType* outptr = out.dptr_;
@@ -96,7 +104,7 @@ class DropoutOp : public Operator {
bernoulli_generate(count, this->pkeep_, maskptr);
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
- outptr[i] = dataptr[i] * maskptr[i];
+ outptr[i] = dataptr[i] * maskptr[i] * (1.0f / pkeep_);
}
#else
Random<xpu> *prnd = ctx.requested[dropout::kRandom].get_random<xpu,
real_t>(s);
@@ -124,6 +132,7 @@ class DropoutOp : public Operator {
Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu,
DType>(s);
+ if (ctx.is_train || mode_ == dropout::kAlways) {
#if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP)
DType* ingradptr = gdata.dptr_;
DType* outgradptr = grad.dptr_;
@@ -131,17 +140,21 @@ class DropoutOp : public Operator {
int count = mask.shape_[0]*mask.shape_[1];
- #pragma omp parallel for
+ #pragma omp parallel for
for (int i = 0; i < count; ++i) {
- ingradptr[i] = outgradptr[i] * maskptr[i];
+ ingradptr[i] = outgradptr[i] * maskptr[i] * (1.0f / pkeep_);
}
#else // USE_MKL && _OPENMP
Assign(gdata, req[dropout::kData], grad * mask);
#endif // USE_MKL && _OPENMP
+ } else {
+ Assign(gdata, req[dropout::kData], F<mshadow_op::identity>(grad));
+ }
}
private:
real_t pkeep_;
+ int mode_;
}; // class DropoutOp
diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc
index 74a50ba..e206214 100644
--- a/src/operator/dropout.cc
+++ b/src/operator/dropout.cc
@@ -33,7 +33,8 @@ MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp)
The whole array is rescaled by :math:`1/(1-p)` to keep the expected
sum of the input unchanged.
-- During testing, this operator does not change the input.
+- During testing, this operator does not change the input if mode is
'training'.
If mode is 'always', the same computation as during training will be applied.
Example::
diff --git a/tests/python/unittest/test_autograd.py
b/tests/python/unittest/test_autograd.py
index 8dea04d..172075d 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -248,6 +248,22 @@ def test_attach_grad():
assert (x.grad.asnumpy() == 2).all()
+def test_is_train():
+ x = mx.nd.ones((10, 10))
+ x.attach_grad()
+ with record(True):
+ y = mx.nd.Dropout(x, p=0.5)
+ assert y.asnumpy().max() == 2 and y.asnumpy().min() == 0
+ y.backward()
+ assert (x.grad.asnumpy() == y.asnumpy()).all()
+
+ with record(False):
+ y = mx.nd.Dropout(x, p=0.5)
+ assert (y.asnumpy() == x.asnumpy()).all()
+ y.backward(is_train=False)
+ assert (x.grad.asnumpy() == x.asnumpy()).all()
+
+
if __name__ == "__main__":
import nose
nose.runmodule()
diff --git a/tests/python/unittest/test_operator.py
b/tests/python/unittest/test_operator.py
index 2f7c3b9..51a77e0 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -3658,6 +3658,42 @@ def test_stack():
check_numeric_gradient(out, inputs)
+def test_dropout():
+ # test dropout
+ x = mx.sym.var('data')
+ y = mx.sym.Dropout(x, p=0.5)
+ exe = y.simple_bind(ctx=default_context(), data=(10, 10))
+
+ exe.arg_arrays[0][:] = 1
+ exe.forward(is_train=True)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))])
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
+ exe.forward(is_train=False)
+ assert (exe.outputs[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all()
+ exe.backward([mx.nd.ones((10, 10))], is_train=False)
+ assert (exe.grad_arrays[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all()
+
+ # test permanent dropout
+ x = mx.sym.var('data')
+ y = mx.sym.Dropout(x, p=0.5, mode='always')
+ exe = y.simple_bind(ctx=default_context(), data=(10, 10))
+
+ exe.arg_arrays[0][:] = 1
+ exe.forward(is_train=True)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))])
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
+ exe.forward(is_train=False)
+ assert exe.outputs[0].asnumpy().max() == 2
+ assert exe.outputs[0].asnumpy().min() == 0
+ exe.backward([mx.nd.ones((10, 10))], is_train=False)
+ assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all()
+
if __name__ == '__main__':
import nose
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].