SINGA-371 Implement functional operations in c++ for autograd - tidy some files and fix some bugs.
- add few shape checks and functions in new developed layer. - rename some files, classes, variables Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/15c0230c Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/15c0230c Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/15c0230c Branch: refs/heads/master Commit: 15c0230cbc98c3662f5e2519bed4da4b26741a4f Parents: 82ef417 Author: xuewanqi <[email protected]> Authored: Mon Jul 2 05:53:13 2018 +0000 Committer: xuewanqi <[email protected]> Committed: Tue Jul 3 03:37:48 2018 +0000 ---------------------------------------------------------------------- examples/autograd/mlp.py | 2 +- examples/autograd/mnist_cnn.py | 2 +- python/singa/autograd.py | 313 +++++------------- src/api/core_device.i | 3 - src/api/model_operation.i | 28 +- src/model/operation/convolution.cc | 371 ++++++++++++++++++++++ src/model/operation/convolution.h | 78 +++++ src/model/operation/convolution_operation.cc | 366 --------------------- src/model/operation/convolution_operation.h | 78 ----- test/python/test_operation.py | 27 +- 10 files changed, 564 insertions(+), 704 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mlp.py ---------------------------------------------------------------------- diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py old mode 100644 new mode 100755 index f7c4353..0447927 --- a/examples/autograd/mlp.py +++ b/examples/autograd/mlp.py @@ -62,7 +62,7 @@ if __name__ == '__main__': label = to_categorical(label, 2).astype(np.float32) print('train_data_shape:', data.shape) print('train_label_shape:', label.shape) - # 1 + inputs = Tensor(data=data) target = Tensor(data=label) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/examples/autograd/mnist_cnn.py 
---------------------------------------------------------------------- diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py old mode 100644 new mode 100755 index cbb5650..a82f64c --- a/examples/autograd/mnist_cnn.py +++ b/examples/autograd/mnist_cnn.py @@ -100,7 +100,7 @@ if __name__ == '__main__': print('the shape of testing label is', y_test.shape) # operations initialization - conv1 = autograd.Conv2D(1, 32, 3, padding=1) + conv1 = autograd.Conv2D(1, 32, 3, padding=1, bias=False) conv2 = autograd.Conv2D(32, 32, 3, padding=1) linear = autograd.Linear(32 * 28 * 28, 10) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/python/singa/autograd.py ---------------------------------------------------------------------- diff --git a/python/singa/autograd.py b/python/singa/autograd.py old mode 100644 new mode 100755 index 474fff4..2a10608 --- a/python/singa/autograd.py +++ b/python/singa/autograd.py @@ -369,105 +369,6 @@ def ctensor2numpy(x): return np_array.reshape(x.shape()) -class Conv2d(Operation): - - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, - padding=0, dilation=1, groups=1, bias=True, **kwargs): - - inner_params = {'name': 'Conv2d', - 'border_mode': 'same', - 'cudnn_prefer': 'fastest', - 'workspace_byte_limit': 1024, - 'data_format': 'NCHW', - 'W_specs': {'init': 'xavier'}, - 'b_specs': {'init': 'constant'}, - 'input_sample_shape': None} - # TODO valid value of inner_params check - - for kwarg in kwargs: - if kwarg not in inner_params: - raise TypeError('Keyword argument not understood:', kwarg) - else: - inner_params[kwarg] = kwargs[kwarg] - - self.in_channels = in_channels - self.out_channels = out_channels - self.W_specs = inner_params['W_specs'] - self.b_specs = inner_params['b_specs'] - - if isinstance(kernel_size, int): - self.kernel_size = (kernel_size, kernel_size) - else: - self.kernel_size = kernel_size - - if padding == 0: - pad = None - else: - pad = padding - - if dilation != 1 or 
groups != 1: - raise ValueError('Not implemented yet') - - self.PyLayer = layer.Conv2D(inner_params['name'], - nb_kernels=out_channels, - kernel=kernel_size, - stride=stride, - border_mode=inner_params['border_mode'], - cudnn_prefer=inner_params['cudnn_prefer'], - workspace_byte_limit=inner_params[ - 'workspace_byte_limit'], - data_format=inner_params['data_format'], - use_bias=bias, - W_specs=self.W_specs, - b_specs=self.b_specs, - pad=pad, - input_sample_shape=inner_params['input_sample_shape']) - - def get_params(self): - assert self.init_value is True, 'must initialize before get_params()' - if self.bias: - return (self.w, self.b) - else: - return self.w - - def __call__(self, x): - if training: - self.flag = model_pb2.kTrain - else: - self.flag = model_pb2.kEval - - if not self.PyLayer.has_setup: - self.PyLayer.setup(x.shape[1:]) - - param_data = self.PyLayer.layer.param_values() - - if not hasattr(self, 'w'): - self.w = Tensor(device=param_data[0].device(), data=param_data[ - 0], requires_grad=True, stores_grad=True) - std = math.sqrt( - 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels)) - self.w.gaussian(0.0, std) - - xs = [x, self.w] - - if len(param_data) == 2: - if not hasattr(self, 'b'): - self.b = Tensor(device=param_data[1].device(), data=param_data[ - 1], requires_grad=True, stores_grad=True) - self.b.set_value(0.0) - - xs.append(self.b) - - xs = tuple(xs) - return self._do_forward(*xs)[0] - - def forward(self, *xs): - return self.PyLayer.layer.Forward(self.flag, xs[0]) - - def backward(self, dy): - ret = self.PyLayer.layer.Backward(self.flag, dy) - return (ret[0],) + ret[1] - class MaxPool2d(Operation): def __init__(self, kernel_size=3, stride=1, padding=0, dilation=1, @@ -548,80 +449,11 @@ class Flatten(Operation): def flatten(x): return Flatten()(x)[0] -class CONV2D(Operation): - '''def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True, **kwargs): - 
self.in_channels = in_channels - self.out_channels = out_channels +class _Conv2D(Operation): - if isinstance(kernel_size, int): - self.kernel_size = (kernel_size, kernel_size) - elif isinstance(kernel_size, tuple): - self.kernel_size = kernel_size - else: - raise TypeError('Wrong kernel_size type.') - - if isinstance(stride, int): - self.stride = (stride,stride) - elif isinstance(stride, tuple): - self.stride = stride - else: - raise TypeError('Wrong stride type.') - - if isinstance(padding, int): - self.padding = (padding,padding) - elif isinstance(padding, tuple): - self.padding = padding - else: - raise TypeError('Wrong padding type.') - - if dilation != 1 or groups != 1: - raise ValueError('Not implemented yet') - - self.bias = bias - - self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024} - # TODO valid value of inner_params check - - for kwarg in kwargs: - if kwarg not in self.inner_params: - raise TypeError('Keyword argument not understood:', kwarg) - else: - self.inner_params[kwarg] = kwargs[kwarg] - - w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1]) - self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True) - std = math.sqrt( - 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels)) - self.W.gaussian(0.0, std) - - if self.bias: - b_shape = (self.out_channels,) - self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True) - self.b.set_value(0.0) - else: - #to keep consistency when to do forward. - self.b = Tensor(data=CTensor([]), requires_grad=False, stores_grad=False) - - def __call__(self, x): - if not hasattr(self, 'device_id'): - self.device_id = x.device.id() - else: - assert self.device_id == x.device.id(),'Not the same device.' 
- - if self.W.device.id() != self.device_id: - self.W.to_device(x.device) - - if self.bias: - if self.b.device.id() != self.device_id: - self.b.to_device(x.device) - - xs = [x, self.W, self.b] - - return self._do_forward(*xs)[0]''' - def __init__(self, handles): - self.handles = handles + def __init__(self, handle): + self.handle = handle def forward(self, x, W, b): #assert x.nDim() == 4, 'The dimensions of input should be 4D.' @@ -631,39 +463,46 @@ class CONV2D(Operation): #assert 0 == 0, 'invalid padding' if training: - self.inputs = (x,W,b) + self.inputs = (x, W, b) - if self.handles.device_id == -1: - return singa.CpuConvForward(x, W, b, self.handles) + if self.handle.device_id == -1: + return singa.CpuConvForward(x, W, b, self.handle) else: - return singa.GpuConvForward(x, W, b, self.handles) + return singa.GpuConvForward(x, W, b, self.handle) def backward(self, dy): - assert training is True and hasattr(self, 'inputs'), 'Please set training as True before do BP. ' - - if dy.device().id() != self.handles.device_id: - dy.ToDevice(self.x.device()) - - if self.handles.device_id == -1: - dx = singa.CpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles) - dW = singa.CpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles) - if self.handles.bias: - db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handles) + assert training is True and hasattr( + self, 'inputs'), 'Please set training as True before do BP. 
' + + if dy.device().id() != self.handle.device_id: + dy.ToDevice(self.inputs[0].device()) + + if self.handle.device_id == -1: + dx = singa.CpuConvBackwardx( + dy, self.inputs[1], self.inputs[0], self.handle) + dW = singa.CpuConvBackwardW( + dy, self.inputs[0], self.inputs[1], self.handle) + if self.handle.bias_term_: + db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle) return dx, dW, db else: - return dx, dW + return dx, dW, None else: - dx = singa.GpuConvBackwardx(dy, self.inputs[1], self.inputs[0], self.handles) - dW = singa.GpuConvBackwardW(dy, self.inputs[0], self.inputs[1], self.handles) - if self.handles.bias: - db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handles) + dx = singa.GpuConvBackwardx( + dy, self.inputs[1], self.inputs[0], self.handle) + dW = singa.GpuConvBackwardW( + dy, self.inputs[0], self.inputs[1], self.handle) + if self.handle.bias_term_: + db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle) return dx, dW, db else: - return dx, dW + return dx, dW, None + + +def conv2d(x, W, b, handle): + return _Conv2D(handle)(x, W, b)[0] -def conv2d(x,W,b,handles): - return CONV2D(handles)(x,W,b)[0] def infer_dependency(op): ''' @@ -776,27 +615,33 @@ def backward(y, dy=None): return gradients -class newlayer(object): + +class NewLayer(object): + def __init__(self): pass - def device_check(*inputs): - pass + def device_check(self, *inputs): + x_device = inputs[0].device + for var in inputs: + if var.device.id() != x_device: + var.to_device(x_device) + +class Linear(NewLayer): -class Linear(newlayer): def __init__(self, in_features, out_features, bias=True): #self.in_features = in_features #self.out_features = out_features w_shape = (in_features, out_features) b_shape = (1, out_features) self.bias = bias - + self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True) std = math.sqrt(2.0 / (in_features + out_features)) self.W.gaussian(0.0, std) - + if self.bias: self.b = Tensor(shape=b_shape, requires_grad=True, 
stores_grad=True) @@ -812,7 +657,9 @@ class Linear(newlayer): y = add_bias(y, self.b, axis=0) return y -class Conv2D(newlayer): + +class Conv2D(NewLayer): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, **kwargs): @@ -825,16 +672,16 @@ class Conv2D(newlayer): self.kernel_size = kernel_size else: raise TypeError('Wrong kernel_size type.') - + if isinstance(stride, int): - self.stride = (stride,stride) + self.stride = (stride, stride) elif isinstance(stride, tuple): self.stride = stride else: raise TypeError('Wrong stride type.') if isinstance(padding, int): - self.padding = (padding,padding) + self.padding = (padding, padding) elif isinstance(padding, tuple): self.padding = padding else: @@ -845,7 +692,8 @@ class Conv2D(newlayer): self.bias = bias - self.inner_params = {'cudnn_prefer': 'fastest', 'workspace_MB_limit': 1024} + self.inner_params = {'cudnn_prefer': 'fastest', + 'workspace_MB_limit': 1024} # TODO valid value of inner_params check for kwarg in kwargs: @@ -853,46 +701,49 @@ class Conv2D(newlayer): raise TypeError('Keyword argument not understood:', kwarg) else: self.inner_params[kwarg] = kwargs[kwarg] - - w_shape = (self.out_channels, self.in_channels, self.kernel_size[0], self.kernel_size[1]) + + w_shape = (self.out_channels, self.in_channels, + self.kernel_size[0], self.kernel_size[1]) self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True) std = math.sqrt( - 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels)) + 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] + self.out_channels)) self.W.gaussian(0.0, std) if self.bias: b_shape = (self.out_channels,) - self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True) + self.b = Tensor(shape=b_shape, requires_grad=True, + stores_grad=True) self.b.set_value(0.0) else: - #to keep consistency when to do forward. 
- self.b = Tensor(data=CTensor([1]), requires_grad=False, stores_grad=False) + # to keep consistency when to do forward. + self.b = Tensor(data=CTensor( + [1]), requires_grad=False, stores_grad=False) self.b.set_value(0.0) def __call__(self, x): + assert x.shape[1] == self.in_channels,'in_channels dismatched' + assert (x.shape[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding or strides.' + assert (x.shape[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding or stride.' + self.device_check(x, self.W, self.b) if x.device.id() == -1: - if not hasattr (self, 'handles'): - self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride, - self.padding, self.in_channels, self.out_channels, self.bias) - elif x.shape[0] != self.handles.batchsize: - self.handles = singa.ConvHandles(x.data, self.kernel_size, self.stride, - self.padding, self.in_channels, self.out_channels, self.bias) + if not hasattr(self, 'handle'): + self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride, + self.padding, self.in_channels, self.out_channels, self.bias) + elif x.shape[0] != self.handle.batchsize: + self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride, + self.padding, self.in_channels, self.out_channels, self.bias) else: - if not hasattr(self, 'handles'): - self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride, - self.padding, self.in_channels, self.out_channels, self.bias, - self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer']) - elif x.shape[0] != self.handles.batchsize: - self.handles = singa.CudnnConvHandles(x.data, self.kernel_size, self.stride, - self.padding, self.in_channels, self.out_channels, self.bias, - self.inner_params['workspace_MB_limit']*1024*1024, self.inner_params['cudnn_prefer']) - self.handles.device_id= x.device.id() - self.handles.bias=self.bias # can simplified - y = conv2d(x, self.W, self.b, self.handles) + if not 
hasattr(self, 'handle'): + self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride, + self.padding, self.in_channels, self.out_channels, self.bias, + self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer']) + elif x.shape[0] != self.handle.batchsize: + self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride, + self.padding, self.in_channels, self.out_channels, self.bias, + self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer']) + self.handle.device_id = x.device.id() + + y = conv2d(x, self.W, self.b, self.handle) return y - - - - http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/core_device.i ---------------------------------------------------------------------- diff --git a/src/api/core_device.i b/src/api/core_device.i index 381f7c6..a5b7de6 100644 --- a/src/api/core_device.i +++ b/src/api/core_device.i @@ -43,14 +43,11 @@ namespace std{ namespace singa{ -enum LangType {kCpp, kCuda, kOpencl,kNumDeviceType}; - class Device { public: virtual void SetRandSeed(unsigned seed) = 0; std::shared_ptr<Device> host(); int id() const; - LangType lang() const; }; class Platform { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/api/model_operation.i ---------------------------------------------------------------------- diff --git a/src/api/model_operation.i b/src/api/model_operation.i old mode 100644 new mode 100755 index 29f8f58..58e5270 --- a/src/api/model_operation.i +++ b/src/api/model_operation.i @@ -1,46 +1,48 @@ %module model_operation %{ -#include "../src/model/operation/convolution_operation.h" +#include "../src/model/operation/convolution.h" %} namespace singa{ -struct ConvHandles{ +struct ConvHandle{ size_t batchsize; + const bool bias_term_; - ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, + ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, const std::vector<size_t> 
stride, const std::vector<size_t> padding, const size_t in_channels, const size_t out_channels, const bool bias_term_); }; -struct CudnnConvHandles{ +struct CudnnConvHandle{ size_t batchsize; + const bool bias_term_; - CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, + CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, const std::vector<size_t> stride, const std::vector<size_t> padding, const size_t in_channels, const size_t out_channels, const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024, const std::string prefer_="fastest"); }; -Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch); +Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch); -Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch); +Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch); -Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch); +Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch); -Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch); +Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch); -Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandles ch); +Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandle &ch); -Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch); +Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch); -Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch); +Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch); -Tensor 
CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch); +Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc new file mode 100755 index 0000000..8d60df4 --- /dev/null +++ b/src/model/operation/convolution.cc @@ -0,0 +1,371 @@ +#include "./convolution.h" +#include "../layer/convolution.h" +#include<iostream> + +namespace singa{ + +ConvHandle::ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, + const std::vector<size_t> stride, const std::vector<size_t> padding, + const size_t in_channels, const size_t out_channels, + const bool bias){ + kernel_h_=kernel_size[0]; + kernel_w_=kernel_size[1]; + + pad_h_=padding[0]; + pad_w_=padding[1]; + + stride_h_=stride[0]; + stride_w_=stride[1]; + + channels_=in_channels; + num_filters_=out_channels; + + bias_term_ = bias; + + batchsize = input.shape(0); + CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched."; + height_ = input.shape(2); + width_ = input.shape(3); + + conv_height_ = 1; + if (stride_h_ > 0) + conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1; + conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1; + + col_height_ = in_channels * kernel_w_ * kernel_h_; + col_width_ = conv_height_ * conv_width_; + imagesize = input.Size() / batchsize; +}; + +CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, + const std::vector<size_t> stride, const std::vector<size_t> padding, + const size_t in_channels, const size_t out_channels,const bool bias_term_, + const size_t workspace_byte_limit_,const std::string prefer_) + :ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){ + + 
DataType dtype = input.data_type(); + auto dev = input.device(); + Context *ctx = dev->context(0); + + CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_)); + if (bias_term_) + CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_)); + + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, + GetCudnnDataType(dtype), batchsize, + channels_, height_, width_)); + CUDNN_CHECK(cudnnSetTensor4dDescriptor( + y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, + num_filters_, conv_height_, conv_width_)); + if (bias_term_) + CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW, + GetCudnnDataType(dtype), 1, + num_filters_, 1, 1)); + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_, + stride_h_, stride_w_, 1, 1, + CUDNN_CROSS_CORRELATION, + GetCudnnDataType(dtype))); + CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype), + CUDNN_TENSOR_NCHW, num_filters_, + channels_, kernel_h_, kernel_w_)); + if (prefer_ == "fastest" || prefer_ == "limited_workspace" || + prefer_ == "no_workspace") { + cudnnConvolutionFwdPreference_t fwd_pref; + cudnnConvolutionBwdFilterPreference_t bwd_filt_pref; + cudnnConvolutionBwdDataPreference_t bwd_data_pref; + if (prefer_ == "fastest") { + fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; + bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST; + bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; + } else if (prefer_ == "limited_workspace") { + fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT; + bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT; + bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT; + } else { + fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; + bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; + bwd_data_pref = 
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT; + } + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref, + workspace_byte_limit_, &fp_alg_)); + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, + bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_)); + // deprecated in cudnn v7 + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, + bwd_data_pref, workspace_byte_limit_, &bp_data_alg_)); + } else if (prefer_ == "autotune") { + const int topk = 1; + int num_fp_alg, num_bp_filt_alg, num_bp_data_alg; + cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk]; + cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk]; + cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk]; + CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( + ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk, + &num_fp_alg, fp_alg_perf)); + fp_alg_ = fp_alg_perf[0].algo; + CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( + ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk, + &num_bp_filt_alg, bp_filt_perf)); + bp_filter_alg_ = bp_filt_perf[0].algo; + CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( + ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk, + &num_bp_data_alg, bp_data_perf)); + bp_data_alg_ = bp_data_perf[0].algo; + } else { + LOG(FATAL) << "Preferred algorithm is not available!"; + } + + size_t fp_byte, bp_data_byte, bp_filter_byte; + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_, + &fp_byte)); + CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, + bp_data_alg_, &bp_data_byte)); + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + ctx->cudnn_handle, x_desc_, 
y_desc_, conv_desc_, filter_desc_, + bp_filter_alg_, &bp_filter_byte)); + workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) / + sizeof(float) + + 1; + if (workspace_count_ * sizeof(float) > workspace_byte_limit_) + LOG(WARNING) << "The required memory for workspace (" + << workspace_count_ * sizeof(float) + << ") is larger than the expected Bytes (" + << workspace_byte_limit_ << ")"; + workspace_ = Tensor(Shape{workspace_count_}, dev, dtype); +}; + +Convolution C; + +Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandle &ch){ + CHECK_EQ(x.device()->lang(), kCpp); + + CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ && + x.shape(3) == ch.width_) << "input sample shape should not change"; + + CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && + W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change"; + + Shape w_shape= W.shape(); + Shape b_shape; + if (ch.bias_term_) + b_shape= b.shape(); + + W.Reshape(Shape{ch.num_filters_, ch.col_height_}); + if (ch.bias_term_) + b.Reshape(Shape{ch.num_filters_}); + + DataType dtype = x.data_type(); + auto dev = x.device(); + Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_}; + Tensor output(shape, dev, dtype); + + Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image + + float *data_col = new float[ch.col_height_ * ch.col_width_]; + auto in_data = x.data<float>(); + for (size_t num = 0; num < ch.batchsize; num++) { + C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, + ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col); + + col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_); + Tensor each = Mult(W, col_data); + if (ch.bias_term_) { + AddColumn(b, &each); + } + CopyDataToFrom(&output, each, each.Size(), num * each.Size()); + }; + W.Reshape(w_shape); + if (ch.bias_term_) + 
b.Reshape(b_shape); + return output; +}; + +Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch){ + CHECK_EQ(dy.device()->lang(), kCpp); + + CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && + dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; + + CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && + W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change"; + + Shape w_shape= W.shape(); + W.Reshape(Shape{ch.num_filters_, ch.col_height_}); + + Tensor dx; + dx.ResetLike(x); + + float *dx_b = new float[ch.imagesize]; + + for (size_t num = 0; num < ch.batchsize; num++) { + Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_}); + CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size()); + Tensor dcol_b = Mult(W.T(), grad_b); + auto dcol_data = dcol_b.data<float>(); + C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_, + ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b); + dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize); + } + W.Reshape(w_shape); + return dx; +}; + +Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch){ + CHECK_EQ(dy.device()->lang(), kCpp); + + CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && + dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; + + CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ && + x.shape(3) == ch.width_) << "input sample shape should not change"; + + Tensor dW; + dW.ResetLike(W); + dW.SetValue(0.0f); + + Shape w_shape= W.shape(); + dW.Reshape(Shape{ch.num_filters_, ch.col_height_}); + + Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image + + float *data_col = new float[ch.col_height_ * ch.col_width_]; + auto in_data = dy.data<float>(); + for (size_t num = 0; num < ch.batchsize; 
num++) { + C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, + ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col); + col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_); + Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_}); + CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size()); + dW += Mult(grad_b, col_data.T()); + } + dW.Reshape(w_shape); + return dW; +}; + +Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch){ + CHECK_EQ(dy.device()->lang(), kCpp); + + CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && + dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; + + CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change"; + + Tensor db; + db.ResetLike(b); + + auto tmpshp = Shape{ch.batchsize * ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)}; + Tensor tmp1 = Reshape(dy, tmpshp); + + Tensor tmp2(Shape{ch.batchsize * ch.num_filters_}); + SumColumns(tmp1, &tmp2); + Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_}); + + SumRows(tmp3, &db); + + return db; +}; + +Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch){ + CHECK_EQ(x.device()->lang(), kCuda); + + DataType dtype = x.data_type(); + auto dev = x.device(); + + Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_}; + Tensor output(shape, dev, dtype); + + output.device()->Exec([output, x, W, cch](Context *ctx) { + Block *inblock = x.block(), *outblock = output.block(), + *wblock = W.block(); + float alpha = 1.f, beta = 0.f; + cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_, + inblock->data(), cch.filter_desc_, wblock->data(), + cch.conv_desc_, cch.fp_alg_, + cch.workspace_.block()->mutable_data(), + cch.workspace_count_ * sizeof(float), &beta, + cch.y_desc_, outblock->mutable_data()); + }, 
{x.block(), W.block()}, {output.block()}, cch.workspace_.block()); + + if (cch.bias_term_) { + output.device()->Exec([output, b, cch](Context *ctx) { + float beta = 1.f, alpha = 1.0f; + Block *outblock = output.block(), *bblock = b.block(); + cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_, + bblock->data(), &beta, cch.y_desc_, + outblock->mutable_data()); + }, {output.block(), b.block()}, {output.block()}); + } + + return output; +}; + +Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch){ + CHECK_EQ(dy.device()->lang(), kCuda); + + Tensor dx; + dx.ResetLike(x); + + dy.device()->Exec([dx, dy, W, cch](Context *ctx) { + Block *wblock = W.block(), *dyblock = dy.block(), + *dxblock = dx.block(); + float alpha = 1.f, beta = 0.f; + cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_, + wblock->data(), cch.y_desc_, dyblock->data(), + cch.conv_desc_, cch.bp_data_alg_, + cch.workspace_.block()->mutable_data(), + cch.workspace_count_ * sizeof(float), &beta, + cch.x_desc_, dxblock->mutable_data()); + }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()}); + + return dx; +}; + +Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch){ + CHECK_EQ(dy.device()->lang(), kCuda); + + Tensor dW; + dW.ResetLike(W); + + dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) { + Block *inblock = x.block(), *dyblock = dy.block(), + *dwblock = dW.block(); + float alpha = 1.f, beta = 0.f; + cudnnConvolutionBackwardFilter( + ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(), + cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_, + cch.workspace_.block()->mutable_data(), + cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_, + dwblock->mutable_data()); + }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()}); + + return dW; +}; + +// input Tensor b for Reset db purpose, can avoid this later. 
+Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch){ + CHECK_EQ(dy.device()->lang(), kCuda); + + Tensor db; + db.ResetLike(b); + + dy.device()->Exec([db, dy, b, cch](Context *ctx) { + Block *dyblock = dy.block(), *dbblock = db.block(); + float alpha = 1.f, beta = 0.f; + cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_, + dyblock->data(), &beta, cch.bias_desc_, + dbblock->mutable_data()); + }, {dy.block()}, {db.block()}); + + return db; +}; + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution.h ---------------------------------------------------------------------- diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h new file mode 100755 index 0000000..96a6d60 --- /dev/null +++ b/src/model/operation/convolution.h @@ -0,0 +1,78 @@ +#include <string> +#include <vector> +#include <cudnn.h> +#include "../layer/cudnn_convolution.h" +#include "../layer/cudnn_utils.h" +#include "singa/utils/logging.h" + +namespace singa{ + +struct ConvHandle{ + size_t kernel_w_; + size_t pad_w_; + size_t stride_w_; + size_t kernel_h_; + size_t pad_h_; + size_t stride_h_; + + size_t channels_; + size_t num_filters_; + + bool bias_term_; + + size_t height_; + size_t width_; + size_t conv_height_; + size_t conv_width_; + size_t batchsize; + + size_t col_height_; + size_t col_width_; + size_t imagesize; + + ConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, + const std::vector<size_t> stride, const std::vector<size_t> padding, + const size_t in_channels, const size_t out_channels, + const bool bias); + +}; + +struct CudnnConvHandle:ConvHandle{ + cudnnTensorDescriptor_t x_desc_ ; + cudnnTensorDescriptor_t y_desc_ ; + cudnnTensorDescriptor_t bias_desc_ ; + cudnnFilterDescriptor_t filter_desc_ ; + cudnnConvolutionDescriptor_t conv_desc_ ; + cudnnConvolutionFwdAlgo_t fp_alg_; + cudnnConvolutionBwdFilterAlgo_t 
bp_filter_alg_; + cudnnConvolutionBwdDataAlgo_t bp_data_alg_; + + size_t workspace_count_; + Tensor workspace_; + + CudnnConvHandle(const Tensor &input, const std::vector<size_t> kernel_size, + const std::vector<size_t> stride, const std::vector<size_t> padding, + const size_t in_channels, const size_t out_channels, + const bool bias, const size_t workspace_byte_limit_=1024*1024*1024, + const std::string prefer_="fastest"); +}; + +Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandle &ch); + +Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch); + +Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch); + +Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch); + + +Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch); + +Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandle &cch); + +Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandle &cch); + +Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch); + + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.cc ---------------------------------------------------------------------- diff --git a/src/model/operation/convolution_operation.cc b/src/model/operation/convolution_operation.cc deleted file mode 100644 index 90b1b4a..0000000 --- a/src/model/operation/convolution_operation.cc +++ /dev/null @@ -1,366 +0,0 @@ -#include "./convolution_operation.h" -#include "../layer/convolution.h" -#include<iostream> - -namespace singa{ - -ConvHandles::ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, - const std::vector<size_t> stride, const std::vector<size_t> padding, - const size_t in_channels, const size_t out_channels, 
- const bool bias_term_){ - kernel_h_=kernel_size[0]; - kernel_w_=kernel_size[1]; - - pad_h_=padding[0]; - pad_w_=padding[1]; - - stride_h_=stride[0]; - stride_w_=stride[1]; - - channels_=in_channels; - num_filters_=out_channels; - - batchsize = input.shape(0); - CHECK(input.shape(1) == in_channels)<<"the number of input channels mismatched."; - height_ = input.shape(2); - width_ = input.shape(3); - - conv_height_ = 1; - if (stride_h_ > 0) - conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1; - conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1; - - col_height_ = in_channels * kernel_w_ * kernel_h_; - col_width_ = conv_height_ * conv_width_; - imagesize = input.Size() / batchsize; -}; - -CudnnConvHandles::CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, - const std::vector<size_t> stride, const std::vector<size_t> padding, - const size_t in_channels, const size_t out_channels,const bool bias_term_, - const size_t workspace_byte_limit_,const std::string prefer_) - :ConvHandles(input, kernel_size, stride, padding, in_channels, out_channels, bias_term_){ - - DataType dtype = input.data_type(); - auto dev = input.device(); - Context *ctx = dev->context(0); - - CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_)); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_)); - if (bias_term_) - CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_)); - - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW, - GetCudnnDataType(dtype), batchsize, - channels_, height_, width_)); - CUDNN_CHECK(cudnnSetTensor4dDescriptor( - y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, - num_filters_, conv_height_, conv_width_)); - if (bias_term_) - CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW, - GetCudnnDataType(dtype), 1, - num_filters_, 1, 1)); - 
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_, - stride_h_, stride_w_, 1, 1, - CUDNN_CROSS_CORRELATION, - GetCudnnDataType(dtype))); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype), - CUDNN_TENSOR_NCHW, num_filters_, - channels_, kernel_h_, kernel_w_)); - if (prefer_ == "fastest" || prefer_ == "limited_workspace" || - prefer_ == "no_workspace") { - cudnnConvolutionFwdPreference_t fwd_pref; - cudnnConvolutionBwdFilterPreference_t bwd_filt_pref; - cudnnConvolutionBwdDataPreference_t bwd_data_pref; - if (prefer_ == "fastest") { - fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; - bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST; - bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; - } else if (prefer_ == "limited_workspace") { - fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT; - bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT; - bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT; - } else { - fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; - bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; - bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT; - } - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( - ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref, - workspace_byte_limit_, &fp_alg_)); - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( - ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, - bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_)); - // deprecated in cudnn v7 - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( - ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, - bwd_data_pref, workspace_byte_limit_, &bp_data_alg_)); - } else if (prefer_ == "autotune") { - const int topk = 1; - int num_fp_alg, num_bp_filt_alg, num_bp_data_alg; - cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk]; - cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk]; - 
cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk]; - CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm( - ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk, - &num_fp_alg, fp_alg_perf)); - fp_alg_ = fp_alg_perf[0].algo; - CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm( - ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk, - &num_bp_filt_alg, bp_filt_perf)); - bp_filter_alg_ = bp_filt_perf[0].algo; - CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm( - ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk, - &num_bp_data_alg, bp_data_perf)); - bp_data_alg_ = bp_data_perf[0].algo; - } else { - LOG(FATAL) << "Preferred algorithm is not available!"; - } - - size_t fp_byte, bp_data_byte, bp_filter_byte; - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( - ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_, - &fp_byte)); - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( - ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, - bp_data_alg_, &bp_data_byte)); - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( - ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, - bp_filter_alg_, &bp_filter_byte)); - workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) / - sizeof(float) + - 1; - if (workspace_count_ * sizeof(float) > workspace_byte_limit_) - LOG(WARNING) << "The required memory for workspace (" - << workspace_count_ * sizeof(float) - << ") is larger than the expected Bytes (" - << workspace_byte_limit_ << ")"; - workspace_ = Tensor(Shape{workspace_count_}, dev, dtype); -}; - -Convolution C; - -Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandles ch){ - CHECK_EQ(x.device()->lang(), kCpp); - - CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ && - x.shape(3) == ch.width_) << "input sample shape should not change"; - - CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && - 
W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change"; - - Shape w_shape= W.shape(); - Shape b_shape= b.shape(); - - W.Reshape(Shape{ch.num_filters_, ch.col_height_}); - if (ch.bias_term_) - b.Reshape(Shape{ch.num_filters_}); - - DataType dtype = x.data_type(); - auto dev = x.device(); - Shape shape{ch.batchsize, ch.num_filters_, ch.conv_height_, ch.conv_width_}; - Tensor output(shape, dev, dtype); - - Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image - - float *data_col = new float[ch.col_height_ * ch.col_width_]; - auto in_data = x.data<float>(); - for (size_t num = 0; num < ch.batchsize; num++) { - C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, - ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col); - - col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_); - Tensor each = Mult(W, col_data); - if (ch.bias_term_) { - AddColumn(b, &each); - } - CopyDataToFrom(&output, each, each.Size(), num * each.Size()); - }; - W.Reshape(w_shape); - b.Reshape(b_shape); - return output; -}; - -Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch){ - CHECK_EQ(dy.device()->lang(), kCpp); - - CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && - dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; - - CHECK(W.shape(0) == ch.num_filters_ && W.shape(1) == ch.channels_ && - W.shape(2) == ch.kernel_h_ && W.shape(3) == ch.kernel_w_) << "weights shape should not change"; - - Shape w_shape= W.shape(); - W.Reshape(Shape{ch.num_filters_, ch.col_height_}); - - Tensor dx; - dx.ResetLike(x); - - float *dx_b = new float[ch.imagesize]; - - for (size_t num = 0; num < ch.batchsize; num++) { - Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_}); - CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size()); - Tensor dcol_b = Mult(W.T(), grad_b); - 
auto dcol_data = dcol_b.data<float>(); - C.Col2im(dcol_data, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, ch.kernel_w_, ch.pad_h_, - ch.pad_w_, ch.stride_h_, ch.stride_w_, dx_b); - dx.CopyDataFromHostPtr(dx_b, ch.imagesize, num * ch.imagesize); - } - W.Reshape(w_shape); - return dx; -}; - -Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch){ - CHECK_EQ(dy.device()->lang(), kCpp); - - CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && - dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; - - CHECK(x.shape(1) == ch.channels_ && x.shape(2) == ch.height_ && - x.shape(3) == ch.width_) << "input sample shape should not change"; - - Tensor dW; - dW.ResetLike(W); - dW.SetValue(0.0f); - - Shape w_shape= W.shape(); - dW.Reshape(Shape{ch.num_filters_, ch.col_height_}); - - Tensor col_data(Shape{ch.col_height_, ch.col_width_});//broadcasted image - - float *data_col = new float[ch.col_height_ * ch.col_width_]; - auto in_data = dy.data<float>(); - for (size_t num = 0; num < ch.batchsize; num++) { - C.Im2col(in_data + num * ch.imagesize, ch.channels_, ch.height_, ch.width_, ch.kernel_h_, - ch.kernel_w_, ch.pad_h_, ch.pad_w_, ch.stride_h_, ch.stride_w_, data_col); - col_data.CopyDataFromHostPtr(data_col, ch.col_height_ * ch.col_width_); - Tensor grad_b(Shape{ch.num_filters_, ch.conv_height_ * ch.conv_width_}); - CopyDataToFrom(&grad_b, dy, grad_b.Size(), 0, num * grad_b.Size()); - dW += Mult(grad_b, col_data.T()); - } - dW.Reshape(w_shape); - return dW; -}; - -Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch){ - CHECK_EQ(dy.device()->lang(), kCpp); - - CHECK(dy.shape(1) == ch.num_filters_ && dy.shape(2) == ch.conv_height_ && - dy.shape(3) == ch.conv_width_) << "input gradients shape should not change"; - - CHECK(b.shape(0) == ch.num_filters_)<< "bias shape should not change"; - - Tensor db; - db.ResetLike(b); - - auto tmpshp = Shape{ch.batchsize * 
ch.num_filters_, dy.Size() / (ch.batchsize * ch.num_filters_)}; - Tensor tmp1 = Reshape(dy, tmpshp); - - Tensor tmp2(Shape{ch.batchsize * ch.num_filters_}); - SumColumns(tmp1, &tmp2); - Tensor tmp3 = Reshape(tmp2, Shape{ch.batchsize, ch.num_filters_}); - - SumRows(tmp3, &db); - - return db; -}; - -Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch){ - CHECK_EQ(x.device()->lang(), kCuda); - - DataType dtype = x.data_type(); - auto dev = x.device(); - - Shape shape{cch.batchsize, cch.num_filters_, cch.conv_height_, cch.conv_width_}; - Tensor output(shape, dev, dtype); - - output.device()->Exec([output, x, W, cch](Context *ctx) { - Block *inblock = x.block(), *outblock = output.block(), - *wblock = W.block(); - float alpha = 1.f, beta = 0.f; - cudnnConvolutionForward(ctx->cudnn_handle, &alpha, cch.x_desc_, - inblock->data(), cch.filter_desc_, wblock->data(), - cch.conv_desc_, cch.fp_alg_, - cch.workspace_.block()->mutable_data(), - cch.workspace_count_ * sizeof(float), &beta, - cch.y_desc_, outblock->mutable_data()); - }, {x.block(), W.block()}, {output.block()}, cch.workspace_.block()); - - if (cch.bias_term_) { - output.device()->Exec([output, b, cch](Context *ctx) { - float beta = 1.f, alpha = 1.0f; - Block *outblock = output.block(), *bblock = b.block(); - cudnnAddTensor(ctx->cudnn_handle, &alpha, cch.bias_desc_, - bblock->data(), &beta, cch.y_desc_, - outblock->mutable_data()); - }, {output.block(), b.block()}, {output.block()}); - } - - return output; -}; - -Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch){ - CHECK_EQ(dy.device()->lang(), kCuda); - - Tensor dx; - dx.ResetLike(x); - - dy.device()->Exec([dx, dy, W, cch](Context *ctx) { - Block *wblock = W.block(), *dyblock = dy.block(), - *dxblock = dx.block(); - float alpha = 1.f, beta = 0.f; - cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, cch.filter_desc_, - wblock->data(), cch.y_desc_, 
dyblock->data(), - cch.conv_desc_, cch.bp_data_alg_, - cch.workspace_.block()->mutable_data(), - cch.workspace_count_ * sizeof(float), &beta, - cch.x_desc_, dxblock->mutable_data()); - }, {dy.block(), W.block()}, {dx.block(), cch.workspace_.block()}); - - return dx; -}; - -Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch){ - CHECK_EQ(dy.device()->lang(), kCuda); - - Tensor dW; - dW.ResetLike(W); - - dy.device()->Exec([dW, dy, x, W, cch](Context *ctx) { - Block *inblock = x.block(), *dyblock = dy.block(), - *dwblock = dW.block(); - float alpha = 1.f, beta = 0.f; - cudnnConvolutionBackwardFilter( - ctx->cudnn_handle, &alpha, cch.x_desc_, inblock->data(), - cch.y_desc_, dyblock->data(), cch.conv_desc_, cch.bp_filter_alg_, - cch.workspace_.block()->mutable_data(), - cch.workspace_count_ * sizeof(float), &beta, cch.filter_desc_, - dwblock->mutable_data()); - }, {dy.block(), x.block()}, {dW.block(), cch.workspace_.block()}); - - return dW; -}; - -// input Tensor b for Reset db purpose, can avoid this later. 
-Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch){ - CHECK_EQ(dy.device()->lang(), kCuda); - - Tensor db; - db.ResetLike(b); - - dy.device()->Exec([db, dy, b, cch](Context *ctx) { - Block *dyblock = dy.block(), *dbblock = db.block(); - float alpha = 1.f, beta = 0.f; - cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, cch.y_desc_, - dyblock->data(), &beta, cch.bias_desc_, - dbblock->mutable_data()); - }, {dy.block()}, {db.block()}); - - return db; -}; - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/src/model/operation/convolution_operation.h ---------------------------------------------------------------------- diff --git a/src/model/operation/convolution_operation.h b/src/model/operation/convolution_operation.h deleted file mode 100644 index 835581e..0000000 --- a/src/model/operation/convolution_operation.h +++ /dev/null @@ -1,78 +0,0 @@ -#include <string> -#include <vector> -#include <cudnn.h> -#include "../layer/cudnn_convolution.h" -#include "../layer/cudnn_utils.h" -#include "singa/utils/logging.h" - -namespace singa{ - -struct ConvHandles{ - size_t kernel_w_; - size_t pad_w_; - size_t stride_w_; - size_t kernel_h_; - size_t pad_h_; - size_t stride_h_; - - size_t channels_; - size_t num_filters_; - - bool bias_term_; - - size_t height_; - size_t width_; - size_t conv_height_; - size_t conv_width_; - size_t batchsize; - - size_t col_height_; - size_t col_width_; - size_t imagesize; - - ConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, - const std::vector<size_t> stride, const std::vector<size_t> padding, - const size_t in_channels, const size_t out_channels, - const bool bias_term_); - -}; - -struct CudnnConvHandles:ConvHandles{ - cudnnTensorDescriptor_t x_desc_ ; - cudnnTensorDescriptor_t y_desc_ ; - cudnnTensorDescriptor_t bias_desc_ ; - cudnnFilterDescriptor_t filter_desc_ ; - cudnnConvolutionDescriptor_t conv_desc_ ; - 
cudnnConvolutionFwdAlgo_t fp_alg_; - cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_; - cudnnConvolutionBwdDataAlgo_t bp_data_alg_; - - size_t workspace_count_; - Tensor workspace_; - - CudnnConvHandles(const Tensor &input, const std::vector<size_t> kernel_size, - const std::vector<size_t> stride, const std::vector<size_t> padding, - const size_t in_channels, const size_t out_channels, - const bool bias_term_, const size_t workspace_byte_limit_=1024*1024*1024, - const std::string prefer_="fastest"); -}; - -Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandles ch); - -Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandles ch); - -Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandles ch); - -Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandles ch); - - -Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandles cch); - -Tensor GpuConvBackwardx(const Tensor &dy, const Tensor &W, const Tensor &x, const CudnnConvHandles cch); - -Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const CudnnConvHandles cch); - -Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandles cch); - - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/15c0230c/test/python/test_operation.py ---------------------------------------------------------------------- diff --git a/test/python/test_operation.py b/test/python/test_operation.py index ece537d..1bbc70c 100644 --- a/test/python/test_operation.py +++ b/test/python/test_operation.py @@ -16,9 +16,6 @@ cpu_dev = device.get_default_device() dy = CTensor([2, 1, 2, 2]) singa.Gaussian(0.0, 1.0, dy) -conv = autograd.Conv2D(3, 1, 2) # (in_channels, out_channels, kernel_size) -conv_without_bias = autograd.Conv2D(3,1,2,bias=False) - def _tuple_to_string(t): lt = [str(x) for x in t] @@ -34,35 +31,43 @@ class 
TestPythonOperation(unittest.TestCase): ) def test_conv2d_gpu(self): + # (in_channels, out_channels, kernel_size) + conv_0 = autograd.Conv2D(3, 1, 2) + conv_without_bias_0 = autograd.Conv2D(3, 1, 2, bias=False) + gpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev) gpu_input_tensor.gaussian(0.0, 1.0) - y = conv(gpu_input_tensor) # PyTensor - dx, dW, db = conv.backward(dy) # CTensor + y = conv_0(gpu_input_tensor) # PyTensor + dx, dW, db = y.creator.backward(dy) # CTensor self.check_shape(y.shape, (2, 1, 2, 2)) self.check_shape(dx.shape(), (2, 3, 3, 3)) self.check_shape(dW.shape(), (1, 3, 2, 2)) self.check_shape(db.shape(), (1,)) - #forward without bias - y_without_bias=conv_without_bias(gpu_input_tensor) + # forward without bias + y_without_bias = conv_without_bias_0(gpu_input_tensor) self.check_shape(y.shape, (2, 1, 2, 2)) def test_conv2d_cpu(self): + # (in_channels, out_channels, kernel_size) + conv_1 = autograd.Conv2D(3, 1, 2) + conv_without_bias_1 = autograd.Conv2D(3, 1, 2, bias=False) + cpu_input_tensor = tensor.Tensor(shape=(2, 3, 3, 3), device=cpu_dev) cpu_input_tensor.gaussian(0.0, 1.0) - y = conv(cpu_input_tensor) # PyTensor - dx, dW, db = conv.backward(dy) # CTensor + y = conv_1(cpu_input_tensor) # PyTensor + dx, dW, db = y.creator.backward(dy) # CTensor self.check_shape(y.shape, (2, 1, 2, 2)) self.check_shape(dx.shape(), (2, 3, 3, 3)) self.check_shape(dW.shape(), (1, 3, 2, 2)) self.check_shape(db.shape(), (1,)) - #forward without bias - y_without_bias=conv_without_bias(cpu_input_tensor) + # forward without bias + y_without_bias = conv_without_bias_1(cpu_input_tensor) self.check_shape(y.shape, (2, 1, 2, 2)) if __name__ == '__main__':
