SINGA-348 Support autograd MLP Example

Rename some variables and add more checks for Dummy operations.

Move MLP.py into examples/autograd as mlp.py; the example now runs without errors.
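
For reference, a minimal sketch of how the updated API is exercised (the same calls as examples/autograd/mlp.py below, reduced to one layer; the toy data, shapes and iteration count are illustrative only):

from singa import tensor, autograd, optimizer
from singa.tensor import Tensor
import numpy as np

# toy data: 4 samples, 2 features, 2 classes (one-hot labels)
data = np.random.uniform(-1, 1, (4, 2)).astype(np.float32)
label = np.eye(2, dtype=np.float32)[np.random.randint(0, 2, 4)]

inputs, target = Tensor(data=data), Tensor(data=label)
w = Tensor(shape=(2, 2), requires_grad=True, stores_grad=True)
w.gaussian(0.0, 0.1)
b = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
b.set_value(0.0)

sgd = optimizer.SGD(0.05)
for i in range(10):
    x = tensor.relu(tensor.add_bias(tensor.matmul(inputs, w), b))
    x = tensor.soft_max(x)                 # renamed from tensor.softmax
    loss = tensor.cross_entropy(x, target)
    # backward() returns a dict mapping each stores_grad tensor to its gradient
    for p, g in autograd.backward(loss).items():
        sgd.apply(0, g, p, '')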


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/060e7dfe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/060e7dfe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/060e7dfe

Branch: refs/heads/master
Commit: 060e7dfe1cc847500e4beb115f3b24e923288b3e
Parents: f42d4d0
Author: Wang Wei <dcs...@nus.edu.sg>
Authored: Wed Apr 11 21:09:45 2018 +0800
Committer: Wang Wei <dcs...@nus.edu.sg>
Committed: Thu Apr 12 16:59:48 2018 +0800

----------------------------------------------------------------------
 examples/MLP.py          | 83 -------------------------------------------
 examples/autograd/mlp.py | 74 ++++++++++++++++++++++++++++++++++++++
 python/singa/autograd.py | 33 ++++++++++-------
 python/singa/tensor.py   | 58 +++++++++++++++++++-----------
 4 files changed, 131 insertions(+), 117 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/examples/MLP.py
----------------------------------------------------------------------
diff --git a/examples/MLP.py b/examples/MLP.py
deleted file mode 100644
index b773efb..0000000
--- a/examples/MLP.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from singa import tensor
-from singa import autograd
-from singa import optimizer
-import numpy as np
-
-
-if __name__ == '__main__':
-
-    # prepare training data in numpy array
-
-    # generate the boundary
-    f = lambda x: (5 * x + 1)
-    bd_x = np.linspace(-1., 1, 200)
-    bd_y = f(bd_x)
-    # generate the training data
-    x = np.random.uniform(-1, 1, 400)
-    y = f(x) + 2 * np.random.randn(len(x))
-    # convert training data to 2d space
-    label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
-    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
-
-    def to_categorical(y, num_classes=None):
-        '''
-        Converts a class vector (integers) to binary class matrix.
-
-        Args
-            y: class vector to be converted into a matrix
-                (integers from 0 to num_classes).
-            num_classes: total number of classes.
-
-        Return
-            A binary matrix representation of the input.
-        '''
-        y = np.array(y, dtype='int')
-        input_shape = y.shape
-        if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
-            input_shape = tuple(input_shape[:-1])
-        y = y.ravel()
-        if not num_classes:
-            num_classes = np.max(y) + 1
-        n = y.shape[0]
-        categorical = np.zeros((n, num_classes))
-        categorical[np.arange(n), y] = 1
-        output_shape = input_shape + (num_classes,)
-        categorical = np.reshape(categorical, output_shape)
-        return categorical
-
-    label = to_categorical(label, 2).astype(np.float32)
-    print('train_data_shape:', data.shape)
-    print('train_label_shape:', label.shape)
-
-    inputs = tensor.Tensor(data=data, requires_grad=False)
-    target = tensor.Tensor(data=label, requires_grad=False)
-
-    w0 = tensor.Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
-    w0.gaussian(0.0, 0.1)
-    b0 = tensor.Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
-    b0.set_value(0.0)
-
-    w1 = tensor.Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
-    w1.gaussian(0.0, 0.1)
-    b1 = tensor.Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
-    b1.set_value(0.0)
-
-    sgd = optimizer.SGD(0.05)
-    # training process
-    for i in range(1001):
-        x = tensor.matmul(inputs, w0)
-        x = tensor.add_bias(x, b0)
-        x = tensor.relu(x)
-        x = tensor.matmul(x, w1)
-        x = tensor.add_bias(x, b1)
-        x = tensor.softmax(x)
-        loss = tensor.cross_entropy(x, target)
-        in_grads = autograd.backward(loss)
-
-        for param in in_grads:
-            sgd.apply(0, in_grads[param], param, '')
-
-        if (i % 100 == 0):
-            print('training loss = ', tensor.to_numpy(loss)[0])
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/examples/autograd/mlp.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
new file mode 100644
index 0000000..7352c21
--- /dev/null
+++ b/examples/autograd/mlp.py
@@ -0,0 +1,74 @@
+from singa import tensor
+from singa.tensor import Tensor
+from singa import autograd
+from singa import optimizer
+import numpy as np
+
+
+if __name__ == '__main__':
+
+    # prepare training data in numpy array
+
+    # generate the boundary
+    f = lambda x: (5 * x + 1)
+    bd_x = np.linspace(-1., 1, 200)
+    bd_y = f(bd_x)
+    # generate the training data
+    x = np.random.uniform(-1, 1, 400)
+    y = f(x) + 2 * np.random.randn(len(x))
+    # convert training data to 2d space
+    label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
+    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
+
+    def to_categorical(y, num_classes):
+        '''
+        Converts a class vector (integers) to binary class matrix.
+
+        Args
+            y: class vector to be converted into a matrix
+                (integers from 0 to num_classes).
+            num_classes: total number of classes.
+
+        Return
+            A binary matrix representation of the input.
+        '''
+        y = np.array(y, dtype='int')
+        n = y.shape[0]
+        categorical = np.zeros((n, num_classes))
+        categorical[np.arange(n), y] = 1
+        return categorical
+
+    label = to_categorical(label, 2).astype(np.float32)
+    print('train_data_shape:', data.shape)
+    print('train_label_shape:', label.shape)
+
+    inputs = Tensor(data=data)
+    target = Tensor(data=label)
+
+    w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
+    w0.gaussian(0.0, 0.1)
+    b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
+    b0.set_value(0.0)
+
+    w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
+    w1.gaussian(0.0, 0.1)
+    b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
+    b1.set_value(0.0)
+
+    sgd = optimizer.SGD(0.05)
+    # training process
+    for i in range(1001):
+        x = tensor.matmul(inputs, w0)
+        x = tensor.add_bias(x, b0)
+        x = tensor.relu(x)
+        x = tensor.matmul(x, w1)
+        x = tensor.add_bias(x, b1)
+        x = tensor.soft_max(x)
+        loss = tensor.cross_entropy(x, target)
+        in_grads = autograd.backward(loss)
+
+        for param in in_grads:
+            sgd.apply(0, in_grads[param], param, '')
+
+        if (i % 100 == 0):
+            print('training loss = ', tensor.to_numpy(loss)[0])

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 399fa19..175f8b2 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -1,6 +1,5 @@
 from collections import Counter, deque
-from singa import tensor
-
+from .tensor import Tensor, Dummy
 
 
 def infer_dependency(op):
@@ -23,7 +22,8 @@ def infer_dependency(op):
     while len(queue) > 0:
         cur_op = queue.pop()
         for src_op, _, _, _ in cur_op.src:
-            if src_op not in dependency_count:
+            if src_op not in dependency_count and \
+                    (not isinstance(src_op, Dummy)):
                 # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
                 dependency_count[src_op] = 0
                 queue.append(src_op)
@@ -53,7 +53,7 @@ def backward(y, dy=None):
     # by default the dy is a tensor with 1.0 for each sample;
     if dy is None:
         dy = float(1.0)
-    elif isinstance(dy, tensor.Tensor):
+    elif isinstance(dy, Tensor):
         dy = dy.data
     else:
         dy = float(dy)
@@ -62,17 +62,22 @@ def backward(y, dy=None):
     ready = deque([(y.creator, (dy,))])
     not_ready = {}  # mapping: op->[dy]
     gradients = {}  # mapping: x->dx if x.stores_grad
+    if y.stores_grad:
+        gradients[y] = dy
 
     while len(ready) > 0:
         op, dys = ready.pop()
-        #if not isinstance(op, tensor.Dummy):
+        if not op.requires_grad or isinstance(op, Dummy):
+            continue
+        # if not isinstance(op, tensor.Dummy):
         dxs = op._do_backward(*dys)
         # TODO src and dx must match
         assert len(op.src) == len(dxs), \
             'the number of src ops (=%d) and dx (=%d) not match' \
             % (len(op.src), len(dxs))
-        for (src_op, x_id, param, x_stores_grad), dx in zip(op.src, dxs):
-            # x_id is the python id of one input arg of op, denoted as x.
+        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
+            # prefix x is w.r.t op; prefix y is w.r.t src_op.
+            # x_id is the python id of one input arg of src_op, denoted as x.
             # y_idx (below) is the index of x among the outputs of src_op.
             # not_ready[src_op][y_idx] records the intermediate gradient
             # of the y_idx'th output of src_op. 'intermediate gradient'
@@ -81,10 +86,11 @@ def backward(y, dy=None):
             # children operations. When src_op is ready, it means that
             # the gradient of all its outputs are available, i.e. all children
             # operations have been backwarded.
-            y_idx = src_op.y_ids[x_id]
+            # y is None if y_stores_grad is False; otherwise it is a Tensor
+            y_idx = src_op.y_id2idx[x_id]
             if src_op not in not_ready:
                 # src_op may have multiple outputs
-                not_ready[src_op] = [None for _ in src_op.y_ids]
+                not_ready[src_op] = [None for _ in src_op.y_id2idx]
                 not_ready[src_op][y_idx] = dx
             else:
                 dxs = not_ready[src_op]
@@ -94,14 +100,15 @@ def backward(y, dy=None):
                     # add the gradient from another children operation that
                     # uses y_idx'th output of src_op as input arg
                     dxs[y_idx] += dx
-            if x_stores_grad:
+            if y_stores_grad:
                 # store the gradient for final return, e.g. if x is parameter
-                gradient = not_ready[src_op][y_idx]
-                gradients[param] = tensor.Tensor(device=gradient.device, data=gradient, requires_grad=False)
+                g = not_ready[src_op][y_idx]
+                gradients[y] = Tensor(device=g.device, data=g)
             dependency[src_op] -= 1
             if src_op.requires_grad is True:
                 if dependency[src_op] == 0:
-                    ready.append((src_op, not_ready[src_op]))
+                    if not isinstance(src_op, Dummy):
+                        ready.append((src_op, not_ready[src_op]))
                     del not_ready[src_op]
 
     return gradients
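
As an aside, the dependency counting done by infer_dependency() and consumed by backward() above can be illustrated with a standalone toy. The Op class below is a hypothetical stand-in, not the SINGA Operation API; it only shows why an operation is pushed to the ready queue after all of its consumers have propagated their gradients:

from collections import deque

class Op:
    # hypothetical stand-in: 'src' holds the producer ops of this op's inputs
    def __init__(self, name, src=()):
        self.name, self.src = name, list(src)

def count_consumers(root):
    # walk back from the loss op and count, per reachable op, how many
    # consumers will later send it a gradient (mirrors infer_dependency)
    count, queue = {root: 0}, deque([root])
    while queue:
        cur = queue.pop()
        for src in cur.src:
            if src not in count:
                count[src] = 0
                queue.append(src)
            count[src] += 1
    return count

# diamond graph: x feeds both a and b; a and b feed the loss y
x = Op('x')
a, b = Op('a', [x]), Op('b', [x])
y = Op('y', [a, b])
assert count_consumers(y)[x] == 2  # x is ready only after both a and b are backwarded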

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/python/singa/tensor.py
----------------------------------------------------------------------
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index f4801a4..70a9302 100644
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -1130,32 +1130,42 @@ class Operation(object):
     a Xxxx instance and calls __call__ to do forward. The autograd engine
     is able to do backward propagation by calling the backward() of Xxxx
     automatically. Notice that the tensors are CTensor. NOT Python Tensor.
-    The arguments of forward includes both CTensor instances and other
-    types of data; The backward function ONLY supports CTensor args.
+    The arguments of forward() and backward() should only include CTensor args.
     '''
 
     def __call__(self, *xs):
         return self._do_forward(*xs)
 
     def _do_forward(self, *xs):
+        '''
+        Do not call this function from user code. It is called by __call__().
+
+        Args:
+            xs, Tensor instance(s)
+
+        Returns:
+            Tensor instance(s)
+        '''
         # TODO add the pre hook
-        # filter out args that are not Tensor instances
-        tensor_xs = [x for x in xs if isinstance(x, Tensor)]
+        assert all([isinstance(x, Tensor) for x in xs]), \
+            'xs should include only Tensor instances'
 
         # need to do backward if any of its input arg needs gradient
-        self.requires_grad = any([x.requires_grad for x in tensor_xs])
-        # src records info of every input arg that needs gradient
-        # the backward() function computes grad only for those arg
+        self.requires_grad = any([x.requires_grad for x in xs])
 
         self.src = []
-        for x in tensor_xs:
+        for x in xs:
             if x.stores_grad:
+                # store the tensor whose gradient needs be returned in
+                # backward(), e.g. if x is parameter
                 self.src.append((x.creator, id(x), x, x.stores_grad))
             else:
+                # for intermediate tensors, they will be released soon;
+                # no need to store them --> use None
                 self.src.append((x.creator, id(x), None, x.stores_grad))
 
-        # use the CTensor (data) if the input arg is Tensor
-        xs = tuple(x.data if isinstance(x, Tensor) else x for x in xs)
+        # get the CTensor (data) if the input arg is Tensor
+        xs = tuple(x.data for x in xs)
         ys = self.forward(*xs)
         if not isinstance(ys, tuple):
             ys = (ys,)
@@ -1166,7 +1176,7 @@ class Operation(object):
                           requires_grad=self.requires_grad,
                           creator=self) for y in ys)
         # map from python id to output index
-        self.y_ids = {id(y): i for i, y in enumerate(ys)}
+        self.y_id2idx = {id(y): i for i, y in enumerate(ys)}
         # TODO add the post hook
         return ys
 
@@ -1180,7 +1190,7 @@ class Operation(object):
         '''Forward propagation.
 
         Args:
-            xs: input args consisting of CTensors and others.
+            xs: input args consisting of only CTensors.
 
         Returns:
             CTensor instance(s)
@@ -1191,7 +1201,7 @@ class Operation(object):
         ''' Backward propagation.
 
         Args:
-            dys: input args consisting of CTensors.
+            dys: input args consisting of only CTensors.
 
         Returns:
             CTensor instance(s)
@@ -1209,7 +1219,7 @@ class Dummy(Operation):
     def __init__(self, tensor, name=None):
         self.name = name
         self.src = []
-        self.y_ids = {id(tensor): 0}
+        self.y_id2idx = {id(tensor): 0}
         self.requires_grad = False
 
 
@@ -1268,7 +1278,8 @@ class Matmul(Operation):
         Returns:
             a tuple for (dx, dw)
         '''
-        return singa.Mult(dy, self.input[1].T()), singa.Mult(self.input[0].T(), dy)
+        return singa.Mult(dy, self.input[1].T()), \
+            singa.Mult(self.input[0].T(), dy)
 
 
 def matmul(x, w):
@@ -1279,10 +1290,11 @@ class AddBias(Operation):
     '''
     Add Bias to each row / column of the Tensor, depending on the parameter axis.
     '''
+
     def __init__(self, axis=0):
         '''
         To indicate the calculation axis, 0 for row, 1 for column.
-        
+
         Args:
             axis: 0 or 1, default is 0.
         '''
@@ -1327,6 +1339,7 @@ class SoftMax(Operation):
     Apply SoftMax for each row of the Tensor or each column of the Tensor
     according to the parameter axis.
     '''
+
     def __init__(self, axis=0):
         self.axis = axis
 
@@ -1372,7 +1385,7 @@ class SoftMax(Operation):
             return dx.T()
 
 
-def softmax(x, axis=0):
+def soft_max(x, axis=0):
     return SoftMax(axis)(x)[0]
 
 
@@ -1381,6 +1394,7 @@ class CrossEntropy(Operation):
     Calculate CrossEntropy loss for a batch of training data.
 
     '''
+
     def forward(self, x, t):
         '''
         Args:
@@ -1393,10 +1407,12 @@ class CrossEntropy(Operation):
         loss = CTensor((1,))
         loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
         loss.SetFloatValue(loss_data / x.shape()[0])
+        self.x = x
+        self.t = t
         self.input = (x, t)
         return loss
 
-    def backward(self, dy):
+    def backward(self, dy=1.0):
         '''
         Args:
             dy (float or CTensor): scalar, accumulate gradient from outside of current network, usually
@@ -1406,8 +1422,8 @@ class CrossEntropy(Operation):
             dx (CTensor): data for the dL /dx, L is the loss, x is the output of current network.
             note that this is true for dy = 1.0
         '''
-        dx = singa.__div__(self.input[1], self.input[0])
-        dx *= float(-1 / self.input[0].shape()[0])
+        dx = singa.__div__(self.t, self.x)
+        dx *= float(-1 / self.x.shape()[0])
         if isinstance(dy, float):
             # dtype of dy: float
             dx *= dy
@@ -1426,4 +1442,4 @@ def ctensor2numpy(x):
     Convert a singa_tensor to numpy_tensor.
     '''
     np_array = x.GetFloatValue(int(x.Size()))
-    return np_array.reshape(x.shape())
\ No newline at end of file
+    return np_array.reshape(x.shape())
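
Finally, for readers following the Operation docstring changes above: a minimal sketch of what a user-defined operation could look like under the updated contract (forward() and backward() take and return CTensors only). EltwiseSquare is a hypothetical example, not part of SINGA, and it reuses only calls that already appear in this diff; the singa_wrap import name is assumed from tensor.py:

from singa.tensor import Tensor, Operation
from singa import singa_wrap as singa   # assumed SWIG-wrapped core module

class EltwiseSquare(Operation):
    '''Hypothetical op: y = x * x (element-wise), on CTensors.'''

    def forward(self, x):
        self.x = x                       # keep the input for backward()
        return singa.__mul__(x, x)       # element-wise multiply, as used in CrossEntropy

    def backward(self, dy):
        dx = singa.__mul__(dy, self.x)   # dL/dx = dy * x ...
        dx *= 2.0                        # ... scaled by 2, via in-place float multiply
        return dx

def eltwise_square(x):
    # __call__ -> _do_forward, which wraps the outputs into a tuple of Tensors
    return EltwiseSquare()(x)[0]

# usage sketch
x = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
x.gaussian(0.0, 0.1)
y = eltwise_square(x)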
