SINGA-348 Support autograd MLP example

Rename some variables and add more checks for Dummy operations.
Move mlp.py into examples/autograd; mlp.py now runs without errors.

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/060e7dfe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/060e7dfe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/060e7dfe

Branch: refs/heads/master
Commit: 060e7dfe1cc847500e4beb115f3b24e923288b3e
Parents: f42d4d0
Author: Wang Wei <dcs...@nus.edu.sg>
Authored: Wed Apr 11 21:09:45 2018 +0800
Committer: Wang Wei <dcs...@nus.edu.sg>
Committed: Thu Apr 12 16:59:48 2018 +0800

----------------------------------------------------------------------
 examples/MLP.py          | 83 ------------------------------------------
 examples/autograd/mlp.py | 74 ++++++++++++++++++++++++++++++++++++++
 python/singa/autograd.py | 33 ++++++++++-------
 python/singa/tensor.py   | 58 +++++++++++++++++++-----------
 4 files changed, 131 insertions(+), 117 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/examples/MLP.py
----------------------------------------------------------------------
diff --git a/examples/MLP.py b/examples/MLP.py
deleted file mode 100644
index b773efb..0000000
--- a/examples/MLP.py
+++ /dev/null
@@ -1,83 +0,0 @@
-from singa import tensor
-from singa import autograd
-from singa import optimizer
-import numpy as np
-
-
-if __name__ == '__main__':
-
-    # prepare training data in numpy array
-
-    # generate the boundary
-    f = lambda x: (5 * x + 1)
-    bd_x = np.linspace(-1., 1, 200)
-    bd_y = f(bd_x)
-    # generate the training data
-    x = np.random.uniform(-1, 1, 400)
-    y = f(x) + 2 * np.random.randn(len(x))
-    # convert training data to 2d space
-    label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
-    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
-
-    def to_categorical(y, num_classes=None):
-        '''
-        Converts a class vector (integers) to binary class matrix.
-
-        Args
-            y: class vector to be converted into a matrix
-                (integers from 0 to num_classes).
-            num_classes: total number of classes.
-
-        Return
-            A binary matrix representation of the input.
-        '''
-        y = np.array(y, dtype='int')
-        input_shape = y.shape
-        if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
-            input_shape = tuple(input_shape[:-1])
-        y = y.ravel()
-        if not num_classes:
-            num_classes = np.max(y) + 1
-        n = y.shape[0]
-        categorical = np.zeros((n, num_classes))
-        categorical[np.arange(n), y] = 1
-        output_shape = input_shape + (num_classes,)
-        categorical = np.reshape(categorical, output_shape)
-        return categorical
-
-    label = to_categorical(label, 2).astype(np.float32)
-    print('train_data_shape:', data.shape)
-    print('train_label_shape:', label.shape)
-
-    inputs = tensor.Tensor(data=data, requires_grad=False)
-    target = tensor.Tensor(data=label, requires_grad=False)
-
-    w0 = tensor.Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
-    w0.gaussian(0.0, 0.1)
-    b0 = tensor.Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
-    b0.set_value(0.0)
-
-    w1 = tensor.Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
-    w1.gaussian(0.0, 0.1)
-    b1 = tensor.Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
-    b1.set_value(0.0)
-
-    sgd = optimizer.SGD(0.05)
-    # training process
-    for i in range(1001):
-        x = tensor.matmul(inputs, w0)
-        x = tensor.add_bias(x, b0)
-        x = tensor.relu(x)
-        x = tensor.matmul(x, w1)
-        x = tensor.add_bias(x, b1)
-        x = tensor.softmax(x)
-        loss = tensor.cross_entropy(x, target)
-        in_grads = autograd.backward(loss)
-
-        for param in in_grads:
-            sgd.apply(0, in_grads[param], param, '')
-
-        if (i % 100 == 0):
-            print('training loss = ', tensor.to_numpy(loss)[0])
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/examples/autograd/mlp.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
new file mode 100644
index 0000000..7352c21
--- /dev/null
+++ b/examples/autograd/mlp.py
@@ -0,0 +1,74 @@
+from singa import tensor
+from singa.tensor import Tensor
+from singa import autograd
+from singa import optimizer
+import numpy as np
+
+
+if __name__ == '__main__':
+
+    # prepare training data in numpy array
+
+    # generate the boundary
+    f = lambda x: (5 * x + 1)
+    bd_x = np.linspace(-1., 1, 200)
+    bd_y = f(bd_x)
+    # generate the training data
+    x = np.random.uniform(-1, 1, 400)
+    y = f(x) + 2 * np.random.randn(len(x))
+    # convert training data to 2d space
+    label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
+    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
+
+    def to_categorical(y, num_classes):
+        '''
+        Converts a class vector (integers) to binary class matrix.
+
+        Args
+            y: class vector to be converted into a matrix
+                (integers from 0 to num_classes).
+            num_classes: total number of classes.
+
+        Return
+            A binary matrix representation of the input.
+        '''
+        y = np.array(y, dtype='int')
+        n = y.shape[0]
+        categorical = np.zeros((n, num_classes))
+        categorical[np.arange(n), y] = 1
+        return categorical
+
+    label = to_categorical(label, 2).astype(np.float32)
+    print('train_data_shape:', data.shape)
+    print('train_label_shape:', label.shape)
+
+    inputs = Tensor(data=data)
+    target = Tensor(data=label)
+
+    w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
+    w0.gaussian(0.0, 0.1)
+    b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
+    b0.set_value(0.0)
+
+    w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
+    w1.gaussian(0.0, 0.1)
+    b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
+    b1.set_value(0.0)
+
+    sgd = optimizer.SGD(0.05)
+    # training process
+    for i in range(1001):
+        x = tensor.matmul(inputs, w0)
+        x = tensor.add_bias(x, b0)
+        x = tensor.relu(x)
+        x = tensor.matmul(x, w1)
+        x = tensor.add_bias(x, b1)
+        x = tensor.soft_max(x)
+        loss = tensor.cross_entropy(x, target)
+        in_grads = autograd.backward(loss)
+
+        for param in in_grads:
+            sgd.apply(0, in_grads[param], param, '')
+
+        if (i % 100 == 0):
+            print('training loss = ', tensor.to_numpy(loss)[0])

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 399fa19..175f8b2 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -1,6 +1,5 @@
 from collections import Counter, deque
-from singa import tensor
-
+from .tensor import Tensor, Dummy


 def infer_dependency(op):
@@ -23,7 +22,8 @@ def infer_dependency(op):
     while len(queue) > 0:
         cur_op = queue.pop()
         for src_op, _, _, _ in cur_op.src:
-            if src_op not in dependency_count:
+            if src_op not in dependency_count and \
+                    (not isinstance(src_op, Dummy)):
                 # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
                 dependency_count[src_op] = 0
                 queue.append(src_op)
@@ -53,7 +53,7 @@ def backward(y, dy=None):
     # by default the dy is a tensor with 1.0 for each sample;
     if dy is None:
         dy = float(1.0)
-    elif isinstance(dy, tensor.Tensor):
+    elif isinstance(dy, Tensor):
        dy = dy.data
     else:
         dy = float(dy)
@@ -62,17 +62,22 @@
     ready = deque([(y.creator, (dy,))])
     not_ready = {}  # mapping: op->[dy]
     gradients = {}  # mapping: x->dx if x.stores_grad
+    if y.stores_grad:
+        gradients[y] = dy

     while len(ready) > 0:
         op, dys = ready.pop()
-        #if not isinstance(op, tensor.Dummy):
+        if not op.requires_grad or isinstance(op, Dummy):
+            continue
+        # if not isinstance(op, tensor.Dummy):
         dxs = op._do_backward(*dys)
         # TODO src and dx must match
         assert len(op.src) == len(dxs), \
             'the number of src ops (=%d) and dx (=%d) not match' \
             % (len(op.src), len(dxs))
-        for (src_op, x_id, param, x_stores_grad), dx in zip(op.src, dxs):
-            # x_id is the python id of one input arg of op, denoted as x.
+        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
+            # prefix x is w.r.t op; prefix y is w.r.t src_op.
+            # x_id is the python id of one input arg of src_op, denoted as x.
             # y_idx (below) is the index of x among the outputs of src_op.
             # not_ready[src_op][y_idx] records the intermediate gradient
             # of the y_idx'th output of src_op. 'intermediate gradient'
@@ -81,10 +86,11 @@
             # children operations. When src_op is ready, it means that
             # the gradient of all its outputs are available, i.e. all children
             # operations have been backwarded.
-            y_idx = src_op.y_ids[x_id]
+            # y is None if y.stores_grad is false; otherwise it is a Tensor
+            y_idx = src_op.y_id2idx[x_id]
             if src_op not in not_ready:
                 # src_op may have mulitple outputs
-                not_ready[src_op] = [None for _ in src_op.y_ids]
+                not_ready[src_op] = [None for _ in src_op.y_id2idx]
                 not_ready[src_op][y_idx] = dx
             else:
                 dxs = not_ready[src_op]
@@ -94,14 +100,15 @@
                     # add the gradient from another children operation that
                     # uses y_idx'th output of src_op as input arg
                     dxs[y_idx] += dx
-            if x_stores_grad:
+            if y_stores_grad:
                 # store the gradient for final return, e.g. if x is parameter
-                gradient = not_ready[src_op][y_idx]
-                gradients[param] = tensor.Tensor(device=gradient.device, data=gradient, requires_grad=False)
+                g = not_ready[src_op][y_idx]
+                gradients[y] = Tensor(device=g.device, data=g)
             dependency[src_op] -= 1
             if src_op.requires_grad is True:
                 if dependency[src_op] == 0:
-                    ready.append((src_op, not_ready[src_op]))
+                    if not isinstance(src_op, Dummy):
+                        ready.append((src_op, not_ready[src_op]))
                     del not_ready[src_op]

     return gradients

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/060e7dfe/python/singa/tensor.py
----------------------------------------------------------------------
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index f4801a4..70a9302 100644
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -1130,32 +1130,42 @@ class Operation(object):
     a Xxxx instance and calls __call__ to do forward. The autograd engine
     is able to do backward propagation by calling the backward() of Xxxx
     automatically. Notice that the tensors are CTensor. NOT Python Tensor.
-    The arguments of forward includes both CTensor instances and other
-    types of data; The backward function ONLY supports CTensor args.
+    The arguments of forward() and backward() should only include CTensor args;
     '''

     def __call__(self, *xs):
         return self._do_forward(*xs)

     def _do_forward(self, *xs):
+        '''
+        Do not call this function from user code. It is called by __call__().
+
+        Args:
+            xs, Tensor instance(s)
+
+        Returns:
+            Tensor instance(s)
+        '''
         # TODO add the pre hook
-        # filter out args that are not Tensor instances
-        tensor_xs = [x for x in xs if isinstance(x, Tensor)]
+        assert all([isinstance(x, Tensor) for x in xs]), \
+            'xs should include only Tensor instances'

         # need to do backward if any of its input arg needs gradient
-        self.requires_grad = any([x.requires_grad for x in tensor_xs])
-        # src records info of every input arg that needs gradient
-        # the backward() function computes grad only for those arg
+        self.requires_grad = any([x.requires_grad for x in xs])
         self.src = []
-        for x in tensor_xs:
+        for x in xs:
             if x.stores_grad:
+                # store the tensor whose gradient needs be returned in
+                # backward(), e.g. if x is parameter
+                self.src.append((x.creator, id(x), x, x.stores_grad))
             else:
+                # for intermediate tensors, they will be released soon;
+                # no need to store them --> use None
                 self.src.append((x.creator, id(x), None, x.stores_grad))

-        # use the CTensor (data) if the input arg is Tensor
-        xs = tuple(x.data if isinstance(x, Tensor) else x for x in xs)
+        # get the CTensor (data) if the input arg is Tensor
+        xs = tuple(x.data for x in xs)
         ys = self.forward(*xs)
         if not isinstance(ys, tuple):
             ys = (ys,)
@@ -1166,7 +1176,7 @@
                           requires_grad=self.requires_grad,
                           creator=self) for y in ys)
         # map from python id to output index
-        self.y_ids = {id(y): i for i, y in enumerate(ys)}
+        self.y_id2idx = {id(y): i for i, y in enumerate(ys)}
         # TODO add the post hook
         return ys
@@ -1180,7 +1190,7 @@
         '''Forward propagation.

         Args:
-            xs: input args consisting of CTensors and others.
+            xs: input args consisting of only CTensors.

         Returns:
             CTensor instance(s)
@@ -1191,7 +1201,7 @@
         '''
         Backward propagation.
         Args:
-            dys: input args consisting of CTensors.
+            dys: input args consisting of only CTensors.

         Returns:
             CTensor instance(s)
@@ -1209,7 +1219,7 @@
     def __init__(self, tensor, name=None):
         self.name = name
         self.src = []
-        self.y_ids = {id(tensor): 0}
+        self.y_id2idx = {id(tensor): 0}
         self.requires_grad = False

@@ -1268,7 +1278,8 @@
         Returns:
             a tuple for (dx, dw)
         '''
-        return singa.Mult(dy, self.input[1].T()), singa.Mult(self.input[0].T(), dy)
+        return singa.Mult(dy, self.input[1].T()), \
+            singa.Mult(self.input[0].T(), dy)


 def matmul(x, w):
@@ -1279,10 +1290,11 @@ class AddBias(Operation):
     '''
     Add Bias to each row / column of the Tensor, depending on the parameter axis.
     '''
+
     def __init__(self, axis=0):
         '''
         To indicate the calculation axis, 0 for row, 1 for column.
-
+
         Args:
             axis: 0 or 1, default is 0.
         '''
@@ -1327,6 +1339,7 @@ class SoftMax(Operation):
     Apply SoftMax for each row of the Tensor or each column of the Tensor
     according to the parameter axis.
     '''
+
     def __init__(self, axis=0):
         self.axis = axis
@@ -1372,7 +1385,7 @@
         return dx.T()


-def softmax(x, axis=0):
+def soft_max(x, axis=0):
     return SoftMax(axis)(x)[0]
@@ -1381,6 +1394,7 @@ class CrossEntropy(Operation):
     Calculte CrossEntropy loss for a batch of training data.

    '''
+
     def forward(self, x, t):
         '''
         Args:
@@ -1393,10 +1407,12 @@
         loss = CTensor((1,))
         loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
         loss.SetFloatValue(loss_data / x.shape()[0])
+        self.x = x
+        self.t = t
         self.input = (x, t)
         return loss

-    def backward(self, dy):
+    def backward(self, dy=1.0):
         '''
         Args:
             dy (float or CTensor): scalar, accumulate gradient from outside of current network, usually
@@ -1406,8 +1422,8 @@
             dx (CTensor): data for the dL /dx, L is the loss, x is the output
                 of current network. note that this is true for dy = 1.0
         '''
-        dx = singa.__div__(self.input[1], self.input[0])
-        dx *= float(-1 / self.input[0].shape()[0])
+        dx = singa.__div__(self.t, self.x)
+        dx *= float(-1 / self.x.shape()[0])
         if isinstance(dy, float):
             # dtype of dy: float
             dx *= dy
@@ -1426,4 +1442,4 @@ def ctensor2numpy(x):
     Convert a singa_tensor to numpy_tensor.
     '''
     np_array = x.GetFloatValue(int(x.Size()))
-    return np_array.reshape(x.shape())
\ No newline at end of file
+    return np_array.reshape(x.shape())
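
For readers who want to try the relocated example, below is a minimal usage sketch distilled from examples/autograd/mlp.py above. It is not part of the commit; it assumes the singa Python package built from this revision is importable, and the toy data, layer shapes, and variable names are illustrative only.

    import numpy as np
    from singa import tensor, autograd, optimizer
    from singa.tensor import Tensor

    # toy batch: 4 two-dimensional points with one-hot labels
    data = np.random.uniform(-1, 1, (4, 2)).astype(np.float32)
    label = np.eye(2, dtype=np.float32)[[0, 1, 0, 1]]

    inputs, target = Tensor(data=data), Tensor(data=label)
    w0 = Tensor(shape=(2, 2), requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
    b0.set_value(0.0)
    sgd = optimizer.SGD(0.05)

    # one forward/backward/update step
    x = tensor.add_bias(tensor.matmul(inputs, w0), b0)
    x = tensor.soft_max(x)          # renamed from tensor.softmax in this commit
    loss = tensor.cross_entropy(x, target)
    grads = autograd.backward(loss)  # backward() now skips Dummy operations
    for p in grads:
        sgd.apply(0, grads[p], p, '')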