SINGA-349 Create layer operations for autograd; clean the code and add comments
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d7d629b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d7d629b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d7d629b

Branch: refs/heads/master
Commit: 6d7d629bf00437c23a67f1de3b1ed085764e9492
Parents: 403843d
Author: Wang Wei <dcs...@nus.edu.sg>
Authored: Thu May 17 21:18:31 2018 +0800
Committer: Wang Wei <dcs...@nus.edu.sg>
Committed: Thu May 17 21:19:07 2018 +0800

----------------------------------------------------------------------
 examples/autograd/mnist_cnn.py |  41 ++++----
 python/singa/autograd.py       | 200 +++++++++++++++++++-----------------
 2 files changed, 126 insertions(+), 115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d7d629b/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
index 7afbb9e..3ab8b62 100644
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -31,18 +31,18 @@ def to_categorical(y, num_classes):
     n = y.shape[0]
     categorical = np.zeros((n, num_classes))
     categorical[np.arange(n), y] = 1
-    categorical=categorical.astype(np.float32)
+    categorical = categorical.astype(np.float32)
     return categorical


 def preprocess(data):
-    data=data.astype(np.float32)
+    data = data.astype(np.float32)
     data /= 255
-    data=np.expand_dims(data, axis=1)
+    data = np.expand_dims(data, axis=1)
     return data


-def accuracy(pred,target):
+def accuracy(pred, target):
     y = np.argmax(pred, axis=1)
     t = np.argmax(target, axis=1)
     a = y == t
@@ -55,8 +55,9 @@ if __name__ == '__main__':
     parser.add_argument('file_path', type=str, help='the dataset path')
     args = parser.parse_args()

-    assert os.path.exists(args.file_path), 'Pls download the MNIST dataset from' \
-        'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'
+    assert os.path.exists(args.file_path), \
+        'Pls download the MNIST dataset from ' \
+        'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'

     train, test = load_data(args.file_path)

@@ -69,18 +70,17 @@ if __name__ == '__main__':

     x_train = preprocess(train[0])
     y_train = to_categorical(train[1], num_classes)
-    x_test=preprocess(test[0])
-    y_test=to_categorical(test[1],num_classes)
-    print ('the shape of training data is', x_train.shape)
-    print ('the shape of training label is', y_train.shape)
-    print ('the shape of testing data is', x_test.shape)
-    print ('the shape of testing label is', y_test.shape)
+    x_test = preprocess(test[0])
+    y_test = to_categorical(test[1], num_classes)
+    print('the shape of training data is', x_train.shape)
+    print('the shape of training label is', y_train.shape)
+    print('the shape of testing data is', x_test.shape)
+    print('the shape of testing label is', y_test.shape)

     # operations initialization
     conv1 = autograd.Conv2d(3, 32)
     conv2 = autograd.Conv2d(32, 32)
-    linear = autograd.Linear(32*28*28, 10)
-
+    linear = autograd.Linear(32 * 28 * 28, 10)

     def forward(x, t):
         y = conv1(x)
@@ -94,18 +94,21 @@
         loss = autograd.cross_entropy(y, t)
         return loss, y

+    autograd.training = True
     for epoch in range(epochs):
         for i in range(batch_number):
-            inputs = tensor.Tensor(data=x_train[i * 100:(1 + i) * 100, :], requires_grad=False, stores_grad=False)
-            targets = tensor.Tensor(data=y_train[i * 100:(1 + i) * 100, :], requires_grad=False, stores_grad=False)
+            inputs = tensor.Tensor(data=x_train[i * 100:(1 + i) * 100, :])
+            targets = tensor.Tensor(data=y_train[i * 100:(1 + i) * 100, :])

             loss, y = forward(inputs, targets)

-            accuracy_rate = accuracy(autograd.ctensor2numpy(y.data),autograd.ctensor2numpy(targets.data))
+            accuracy_rate = accuracy(autograd.ctensor2numpy(
+                y.data), autograd.ctensor2numpy(targets.data))
             if (i % 5 == 0):
-                print('accuracy is:', accuracy_rate,'loss is:', autograd.ctensor2numpy(loss.data)[0])
+                print('accuracy is:', accuracy_rate, 'loss is:',
+                      autograd.ctensor2numpy(loss.data)[0])

             in_grads = autograd.backward(loss)

             for param in in_grads:
-                sgd.apply(0, in_grads[param], param, '')
\ No newline at end of file
+                sgd.apply(0, in_grads[param], param, '')


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d7d629b/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index de3dc92..fc438ab 100644
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -1,18 +1,18 @@
 from __future__ import division
-from functools import reduce
 from collections import Counter, deque
-from .tensor import Tensor
+import numpy as np
+import math

-from singa import layer
+from .tensor import Tensor
+from . import layer
 from singa.proto import model_pb2
 from . import singa_wrap as singa
-import numpy as np
-import math

 CTensor = singa.Tensor

+training = False


 class Operation(object):
@@ -20,12 +20,16 @@ class Operation(object):
     An operation includes the forward and backward function of
     tensor calculation.

-    To add a specific operation Xxxx, subclass Operation and implement
-    forward() and backward(). Then implement a function xxxx which creates
-    a Xxxx instance and calls __call__ to do forward. The autograd engine
-    is able to do backward propagation by calling the backward() of Xxxx
-    automatically. Notice that the tensors are CTensor. NOT Python Tensor.
-    The arguments of forward() and backward() should only include CTensor args;
+    Steps to add a specific operation Xxxx:
+    1. create a subclass of Operation, name it as Xxxx
+    2. if Xxxx is implemented using other Operations, then override
+       _do_forward() function;
+       if Xxxx is implemented using CTensor operations,
+       then override the forward() and backward(); The arguments of forward()
+       and backward() should only include CTensor;
+       if Xxxx is implemented by calling functions in layer.py, then override
+       __call__(), forward() and backward(). TODO(wangwei) avoid this complex
+       case.
     '''

     def __call__(self, *xs):
@@ -103,6 +107,9 @@ class Operation(object):
         '''
         raise NotImplementedError

+    def get_params(self):
+        return []
+

 class Dummy(Operation):
     '''Dummy operation whice serves as a placehoder for autograd
@@ -119,10 +126,6 @@ class Dummy(Operation):

 class ReLU(Operation):

-    def __call__(self, x, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
-        self.flag=flag
-        return self._do_forward(x)

     def forward(self, x):
         '''
         Returns:
             a new CTensor whose element y = x if x >= 0; otherwise 0;
         '''
-        if self.flag:
+        if training:
             self.input = x
         return singa.ReLU(x)
@@ -154,10 +157,6 @@ def relu(x):

 class Matmul(Operation):
     '''For matrix multiplication'''

-    def __call__(self, x, w, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
-        self.flag=flag
-        return self._do_forward(x, w)

     def forward(self, x, w):
         '''Do forward propgation.
@@ -171,7 +170,7 @@
         Returns:
             a CTensor for the result
         '''
-        if self.flag:
+        if training:
             self.input = (x, w)
         return singa.Mult(x, w)
@@ -187,13 +186,13 @@
             singa.Mult(self.input[0].T(), dy)


-def matmul(x, w, flag=True):
-    return Matmul()(x, w, flag)[0]
+def matmul(x, w):
+    return Matmul()(x, w)[0]


 class AddBias(Operation):
     '''
-    Add Bias to each row / column of the Tensor, depending on the parameter axis.
+    Add Bias to each row / column of the Tensor, depending on the axis arg.
     '''

     def __init__(self, axis=0):
@@ -303,7 +302,8 @@ class CrossEntropy(Operation):
     def forward(self, x, t):
         '''
         Args:
-            x (CTensor): 1d or 2d tensor, the prediction data(output) of current network.
+            x (CTensor): 1d or 2d tensor, the prediction data(output)
+                         of current network.
             t (CTensor): 1d or 2d tensor, the target data for training.

         Returns:
@@ -320,12 +320,13 @@ class CrossEntropy(Operation):
     def backward(self, dy=1.0):
         '''
         Args:
-            dy (float or CTensor): scalar, accumulate gradient from outside of current network, usually
-                equal to 1.0
+            dy (float or CTensor): scalar, accumulate gradient from outside
+                                   of current network, usually equal to 1.0

         Returns:
-            dx (CTensor): data for the dL /dx, L is the loss, x is the output of current network.
-                note that this is true for dy = 1.0
+            dx (CTensor): data for the dL /dx, L is the loss, x is the output
+                          of current network. note that this is true for
+                          dy = 1.0
         '''
         dx = singa.__div__(self.t, self.x)
         dx *= float(-1 / self.x.shape()[0])
@@ -351,17 +352,18 @@ def ctensor2numpy(x):


 class Conv2d(Operation):
-    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=0, dilation=1, groups=1, bias=True,
-                 **kwargs):
+
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
         inner_params = {'name': 'Conv2d',
-                        'border_mode': 'same',
-                        'cudnn_prefer': 'fastest',
-                        'workspace_byte_limit': 1024,
-                        'data_format': 'NCHW',
-                        'W_specs': {'init': 'xavier'},
-                        'b_specs': {'init': 'constant'},
-                        'input_sample_shape': None}
+                        'border_mode': 'same',
+                        'cudnn_prefer': 'fastest',
+                        'workspace_byte_limit': 1024,
+                        'data_format': 'NCHW',
+                        'W_specs': {'init': 'xavier'},
+                        'b_specs': {'init': 'constant'},
+                        'input_sample_shape': None}

         # TODO valid value of inner_params check
         for kwarg in kwargs:
@@ -369,7 +371,7 @@ class Conv2d(Operation):
             if kwarg not in allowed_kwargs:
                 raise TypeError('Keyword argument not understood:', kwarg)
             else:
                 inner_params[kwarg] = kwargs[kwarg]
-
+
         self.in_channels = in_channels
         self.out_channels = out_channels
         self.W_specs = inner_params['W_specs']
@@ -388,21 +390,30 @@ class Conv2d(Operation):
         if dilation != 1 or groups != 1:
             raise ValueError('Not implemented yet')

-        self.PyLayer = layer.Conv2D(inner_params['name'], nb_kernels=out_channels, kernel=kernel_size, stride=stride,
+        self.PyLayer = layer.Conv2D(inner_params['name'],
+                                    nb_kernels=out_channels,
+                                    kernel=kernel_size,
+                                    stride=stride,
                                     border_mode=inner_params['border_mode'],
-                                    cudnn_prefer=inner_params['cudnn_prefer'], workspace_byte_limit=inner_params['workspace_byte_limit'],
-                                    data_format=inner_params['data_format'], use_bias=bias, W_specs=self.W_specs, b_specs=self.b_specs,
-                                    pad=pad, input_sample_shape=inner_params['input_sample_shape'])
+                                    cudnn_prefer=inner_params['cudnn_prefer'],
+                                    workspace_byte_limit=inner_params[
+                                        'workspace_byte_limit'],
+                                    data_format=inner_params['data_format'],
+                                    use_bias=bias,
+                                    W_specs=self.W_specs,
+                                    b_specs=self.b_specs,
+                                    pad=pad,
+                                    input_sample_shape=inner_params['input_sample_shape'])

     def get_params(self):
-        assert self.has_setup, \
-            'Must call setup() before get_params()'
-        params = self.PyLayer.layer.param_values()
-        return params
-
-    def __call__(self, x, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
-        if flag:
+        assert self.init_value is True, 'must initialize before get_params()'
+        if self.bias:
+            return (self.w, self.b)
+        else:
+            return self.w
+
+    def __call__(self, x):
+        if training:
             self.flag = model_pb2.kTrain
         else:
             self.flag = model_pb2.kEval
@@ -413,15 +424,18 @@ class Conv2d(Operation):
         param_data = self.PyLayer.layer.param_values()

         if not hasattr(self, 'w'):
-            self.w = Tensor(device=param_data[0].device, data=param_data[0], requires_grad=True, stores_grad=True)
-            std = math.sqrt(2.0/(self.in_channels*self.kernel_size[0]*self.kernel_size[1]+self.out_channels))
+            self.w = Tensor(device=param_data[0].device, data=param_data[
+                            0], requires_grad=True, stores_grad=True)
+            std = math.sqrt(
+                2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
             self.w.gaussian(0.0, std)

         xs = [x, self.w]

         if len(param_data) == 2:
             if not hasattr(self, 'b'):
-                self.b = Tensor(device=param_data[1].device, data=param_data[1], requires_grad=True, stores_grad=True)
+                self.b = Tensor(device=param_data[1].device, data=param_data[
+                                1], requires_grad=True, stores_grad=True)
                 self.b.set_value(0.0)

             xs.append(self.b)
@@ -434,10 +448,11 @@ class Conv2d(Operation):

     def backward(self, dy):
         ret = self.PyLayer.layer.Backward(self.flag, dy)
-        return (ret[0],)+ret[1]
+        return (ret[0],) + ret[1]


 class Linear(Operation):
+
     def __init__(self, in_features, out_features, bias=True):
         self.in_features = in_features
         self.out_features = out_features
@@ -453,47 +468,36 @@ class Linear(Operation):
         else:
             return self.w

-    def init_params(self, w, b=None):
-        if self.bias:
-            assert b is not None, 'must initialize bias.'
-            assert w.shape == self.w_shape, 'shape of parameters must match.'
-            assert b.shape == self.b_shape, 'shape of parameters must match.'
-            self.w = w
-            self.b = b
-        else:
-            assert b is None, 'cannot initialize bias.'
-            assert w.shape == self.w_shape, 'shape of parameters must match.'
-            self.w = w
-        self.init_value = True
-        return
-
-    def __call__(self, x, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
+    def __call__(self, x):
         if self.init_value is False:
-            self.w = Tensor(shape=self.w_shape, requires_grad=True, stores_grad=True)
+            self.w = Tensor(shape=self.w_shape,
+                            requires_grad=True, stores_grad=True)
             std = math.sqrt(2.0 / (self.in_features + self.out_features))
             self.w.gaussian(0.0, std)
             if self.bias:
-                self.b = Tensor(shape=self.b_shape, requires_grad=True, stores_grad=True)
+                self.b = Tensor(shape=self.b_shape,
+                                requires_grad=True, stores_grad=True)
                 self.b.set_value(0.0)
             self.init_value = True
-        y = matmul(x, self.w, flag)
+        y = matmul(x, self.w)
         if self.bias:
             y = add_bias(y, self.b, axis=0)
         return y


 class MaxPool2d(Operation):
-    def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1, return_indices=False, ceil_mode=False, **kwargs):
+
+    def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1,
+                 return_indices=False, ceil_mode=False, **kwargs):
         inner_params = {'name': 'MaxPool2d',
-                        'border_mode': 'same',
-                        'data_format': 'NCHW',
-                        'input_sample_shape': None
-                        }
+                        'border_mode': 'same',
+                        'data_format': 'NCHW',
+                        'input_sample_shape': None
+                        }

         for kwarg in kwargs:
-            if kwarg not in allowed_kwargs:
+            if kwarg not in inner_params:
                 raise TypeError('Keyword argument not understood:', kwarg)
             else:
                 inner_params[kwarg] = kwargs[kwarg]
@@ -503,16 +507,18 @@ class MaxPool2d(Operation):
         else:
             pad = padding

-        if dilation != 1 or return_indices is not False or ceil_mode is not False:
+        if dilation != 1 or return_indices or ceil_mode:
             raise ValueError('Not implemented yet')

-        self.PyLayer = layer.Pooling2D(inner_params['name'], model_pb2.PoolingConf.MAX,
-                                       kernel_size, stride, inner_params['border_mode'],
-                                       pad, inner_params['data_format'], inner_params['input_sample_shape'])
+        self.PyLayer = layer.Pooling2D(inner_params['name'],
+                                       model_pb2.PoolingConf.MAX,
+                                       kernel_size, stride, inner_params[
+                                           'border_mode'],
+                                       pad, inner_params['data_format'],
+                                       inner_params['input_sample_shape'])

-    def __call__(self, x, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
-        if flag:
+    def __call__(self, x):
+        if training:
             self.flag = model_pb2.kTrain
         else:
             self.flag = model_pb2.kEval
@@ -529,17 +535,19 @@ class MaxPool2d(Operation):
         return self.PyLayer.layer.Backward(0, dy)[0]


-def max_pool_2d(x, kernel_size=3, stride=1, padding=0, dilation=1, return_indices=False, ceil_mode=False, **kwargs):
-    return MaxPool2d(kernel_size, stride, padding, dilation, return_indices, ceil_mode, **kwargs)(x)[0]
+def max_pool_2d(x, kernel_size=3, stride=1, padding=0, dilation=1,
+                return_indices=False, ceil_mode=False, **kwargs):
+    return MaxPool2d(kernel_size, stride, padding, dilation, return_indices,
+                     ceil_mode, **kwargs)(x)[0]


 class Flatten(Operation):
-    def __init__(self, name='Flatten', axis=1, input_sample_shape=None):
-        self.PyLayer = layer.Flatten(name, axis, input_sample_shape)
-    def __call__(self, x, flag=True):
-        assert type(flag) is bool, 'flag can only be bool.'
-        if flag:
+
+    def __init__(self):
+        self.PyLayer = layer.Flatten('flatten', 1)
+
+    def __call__(self, x):
+        if training:
             self.flag = model_pb2.kTrain
         else:
             self.flag = model_pb2.kEval
@@ -554,8 +562,8 @@ class Flatten(Operation):
         return self.PyLayer.layer.Backward(0, dy)[0]


-def flatten(x, name='Flatten', axis=1, input_sample_shape=None):
-    return Flatten(name, axis, input_sample_shape)(x)[0]
+def flatten(x):
+    return Flatten()(x)[0]


 def infer_dependency(op):
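----------------------------------------------------------------------

For reference, the pattern described by the new Operation docstring can be
illustrated with a short sketch (not part of this commit). It follows steps
1-2: subclass Operation, override forward() and backward() with CTensor
arguments, and cache state only when the module-level `training` flag is
set, mirroring ReLU and Matmul above. The Square operation below is
hypothetical; it assumes singa.__mul__ is the element-wise CTensor product
exposed by the swig wrapper, in the same family as singa.__div__ used in
CrossEntropy.backward().

    from singa import autograd
    from singa import singa_wrap as singa


    class Square(autograd.Operation):
        '''y = x * x element-wise; a hypothetical example operation.'''

        def forward(self, x):
            # cache the input only in training mode, following the
            # `training` flag convention introduced by this commit
            if autograd.training:
                self.input = x
            return singa.__mul__(x, x)

        def backward(self, dy):
            # dL/dx = dy * 2x; CTensor supports in-place scaling by a
            # float, as in CrossEntropy.backward() above
            dx = singa.__mul__(dy, self.input)
            dx *= 2.0
            return dx


    def square(x):
        # Operation.__call__ returns a tuple of outputs, hence the [0]
        return Square()(x)[0]

With autograd.training = True set, as in mnist_cnn.py above, square() can be
used like any built-in operation and autograd.backward(loss) will reach
Square.backward() automatically.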