SINGA-371 Implement functional operations in C++ for autograd; fix some bugs and update the example for autograd
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e16cea12
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e16cea12
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e16cea12

Branch: refs/heads/master
Commit: e16cea129b688c804afe87b3bd1b6a82e5f5ca5f
Parents: e209203
Author: wang wei <[email protected]>
Authored: Sat Jul 7 22:00:07 2018 +0800
Committer: wang wei <[email protected]>
Committed: Sun Jul 8 16:01:45 2018 +0800

----------------------------------------------------------------------
 examples/autograd/mnist_cnn.py       |  25 ++++---
 python/singa/autograd.py             |   6 +-
 src/api/model_operation.i            |  29 ++++----
 src/core/tensor/tensor.cc            |   4 +-
 src/core/tensor/tensor_math_cuda.h   | 114 +++++++++++++++++-------------
 src/model/layer/cudnn_convolution.cc |   2 +-
 src/model/operation/convolution.cc   |   9 ++-
 src/model/operation/convolution.h    |   1 +
 tool/conda/singa/build.sh            |   3 +-
 tool/conda/singa/meta.yaml           |   2 +-
 10 files changed, 112 insertions(+), 83 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
index a82f64c..5b4e608 100755
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -21,13 +21,11 @@ import numpy as np
 import argparse
 import os
 
-import singa
+from singa import device
 from singa import tensor
 from singa import autograd
 from singa import optimizer
 
-singa.layer.engine = 'singacpp'
-
 
 def load_data(path):
     f = np.load(path)
@@ -75,11 +73,18 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='Train CNN over MNIST')
     parser.add_argument('file_path', type=str, help='the dataset path')
+    parser.add_argument('--use_cpu', action='store_true')
     args = parser.parse_args()
 
     assert os.path.exists(args.file_path), \
-        'Pls download the MNIST dataset from ' \
-        'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'
+        'Pls download the MNIST dataset from '
+
+    if args.use_cpu:
+        print('Using CPU')
+        dev = device.get_default_device()
+    else:
+        print('Using GPU')
+        dev = device.create_cuda_gpu()
 
     train, test = load_data(args.file_path)
@@ -119,16 +124,16 @@ if __name__ == '__main__':
     autograd.training = True
     for epoch in range(epochs):
         for i in range(batch_number):
-            inputs = tensor.Tensor(data=x_train[i * 100:(1 + i) * 100, :])
-            targets = tensor.Tensor(data=y_train[i * 100:(1 + i) * 100, :])
+            inputs = tensor.Tensor(device=dev, data=x_train[i * 100:(1 + i) * 100])
+            targets = tensor.Tensor(device=dev, data=y_train[i * 100:(1 + i) * 100])
 
             loss, y = forward(inputs, targets)
 
-            accuracy_rate = accuracy(autograd.ctensor2numpy(y.data),
-                                     autograd.ctensor2numpy(targets.data))
+            accuracy_rate = accuracy(tensor.to_numpy(y),
+                                     tensor.to_numpy(targets))
             if (i % 5 == 0):
                 print('accuracy is:', accuracy_rate, 'loss is:',
-                      autograd.ctensor2numpy(loss.data)[0])
+                      tensor.to_numpy(loss)[0])
 
             in_grads = autograd.backward(loss)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 80209ff..9fd8b4d 100755
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -741,12 +741,10 @@
         else:
             if not hasattr(self, 'handle'):
                 self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                    self.padding, self.in_channels, self.out_channels, self.bias,
-                                                    self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+                                                    self.padding, self.in_channels, self.out_channels, self.bias)
             elif x.shape[0] != self.handle.batchsize:
                 self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                    self.padding, self.in_channels, self.out_channels, self.bias,
-                                                    self.inner_params['workspace_MB_limit'] * 1024 * 1024, self.inner_params['cudnn_prefer'])
+                                                    self.padding, self.in_channels, self.out_channels, self.bias)
         self.handle.device_id = x.device.id()
 
         y = conv2d(x, self.W, self.b, self.handle)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/api/model_operation.i
----------------------------------------------------------------------
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 2c13a3b..3858a2b 100755
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,9 +1,12 @@
 %module model_operation
+%include "config.i"
+%include "std_vector.i"
+%include "std_string.i"
 
 %{
 #include "../src/model/operation/convolution.h"
 %}
 
-namespace singa{
+namespace singa {
 
 class ConvHandle {
  public:
@@ -15,15 +18,24 @@ class ConvHandle {
   size_t batchsize;
 };
 
-struct CudnnConvHandle{
+Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandle &ch);
+
+Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);
+
+Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);
+
+Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
+
+#if USE_CUDNN
+class CudnnConvHandle: public ConvHandle {
  public:
-  CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
+  CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                   const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                   const size_t in_channels, const size_t out_channels, const bool bias,
                   const size_t workspace_byte_limit = 1024 * 1024 * 1024,
                   const std::string& prefer = "fastest");
   bool bias_term;
-  size_t batchsize;
+  size_t batchsize;
 };
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
@@ -34,13 +46,6 @@ Tensor GpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, cons
 
 Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle &cch);
 
-
-Tensor CpuConvForward(const Tensor &x, Tensor &W, Tensor &b, const ConvHandle &ch);
-
-Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x, const ConvHandle &ch);
-
-Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W, const ConvHandle &ch);
-
-Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b, const ConvHandle &ch);
+#endif  // USE_CUDNN
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index e0a9ecb..05db7cf 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -346,7 +346,7 @@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device>
   } else {
     if (repeats.size() == 1){
       total_repeats = repeats[0];
-      for (int i = 0; i < shape_.size(); i++) {
+      for (size_t i = 0; i < shape_.size(); i++) {
         if (i == axis) {
           tshape.push_back(shape_[i] * total_repeats);
         } else {
@@ -363,7 +363,7 @@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device>
       }
       total_repeats += repeats[i];
     }
-    for (int i = 0; i < shape_.size(); i++){
+    for (size_t i = 0; i < shape_.size(); i++){
       if (i == axis) {
         tshape.push_back(total_repeats);
       } else{

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index a1b9381..2a43468 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -791,6 +791,12 @@ void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
   const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
+#if CUDNN_MAJOR < 7
+  size_t num = in.Size();
+  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+
+#else
+
   float alpha1 = 1.0;
   float alpha2 = 0.0;
   float beta = 0.0;
@@ -800,6 +806,7 @@ void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
                 (void*)(&alpha2), in_desc, inPtr,
                 (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
                 ));
+#endif  // CUDNN_MAJOR < 7
 }
 
 /// Element-wise operation, out[i]=in[i]^2
@@ -833,54 +840,6 @@ void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //   // cuda::sum(num, inPtr, out, ctx->stream);
 // }
 
-template <>
-void Sum<float, lang::Cuda>(const Tensor& in, float* out,
-                            Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in.block()->data());
-
-  //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
-  Shape reduced_shape = {1};
-  Tensor t(reduced_shape, in.device(), in.data_type());
-  float* tPtr = static_cast<float*>(t.block()->mutable_data());
-  vector<int> reduce_all_axes = generate_shape_cuda(in);
-  for (size_t n = 0; n < reduce_all_axes.size(); ++n) {
-    reduce_all_axes[n] = 1;
-  }
-
-  //reduce_desc
-  cudnnReduceTensorDescriptor_t reduce_desc;
-  cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
-  cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
-  cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
-  check_cudnn(cudnnCreateReduceTensorDescriptor(&reduce_desc));
-  check_cudnn(cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
-                                             cudnn_propagation, cudnn_indices, cudnn_indices_type));
-
-  //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
-  size_t reduction_size_int = Product(in.shape());
-  Shape reduction_size = {reduction_size_int * 100};
-  Tensor indices(reduction_size, in.device(), in.data_type());
-  Tensor workspace(reduction_size, in.device(), in.data_type());
-  size_t indices_bytes = indices.block()->size() * 100;
-  size_t workspace_bytes = workspace.block()->size() * 100;
-  size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
-  float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
-  //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
-  //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
-
-  float alpha = 1.0;
-  float beta = 0.0;
-  check_cudnn(cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
-                                indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
-                                (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                                (void*)(&beta), generate_tensor_nd_desc(t), tPtr
-                                ));
-
-  *out = tPtr[0];
-}
-
 /// Element-wise operation, out[i]=tanh([in[i])
 // template <>
@@ -949,7 +908,7 @@ void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out,
                 (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
                 (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
                 ));
-  
+
 }
 
 // ================Random functions===========================================
@@ -1233,6 +1192,63 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
   }
 }
 
+
+// must put this function after Set and Dot functions due to the error from
+// instantiation before specialization
+template <>
+void Sum<float, lang::Cuda>(const Tensor& in, float* out,
+                            Context* ctx) {
+#if CUDNN_MAJOR < 7
+  Tensor one(in.shape(), in.device(), in.data_type());
+  Set<float, lang::Cuda>(float(1), &one, ctx);
+  Dot<float, lang::Cuda>(in, one, out, ctx);
+#else
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
+  Shape reduced_shape = {1};
+  Tensor t(reduced_shape, in.device(), in.data_type());
+  float* tPtr = static_cast<float*>(t.block()->mutable_data());
+  vector<int> reduce_all_axes = generate_shape_cuda(in);
+  for (size_t n = 0; n < reduce_all_axes.size(); ++n) {
+    reduce_all_axes[n] = 1;
+  }
+
+  //reduce_desc
+  cudnnReduceTensorDescriptor_t reduce_desc;
+  cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
+  cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
+  check_cudnn(cudnnCreateReduceTensorDescriptor(&reduce_desc));
+  check_cudnn(cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
+                                             cudnn_propagation, cudnn_indices, cudnn_indices_type));
+
+  //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
+  size_t reduction_size_int = Product(in.shape());
+  Shape reduction_size = {reduction_size_int * 100};
+  Tensor indices(reduction_size, in.device(), in.data_type());
+  Tensor workspace(reduction_size, in.device(), in.data_type());
+  size_t indices_bytes = indices.block()->size() * 100;
+  size_t workspace_bytes = workspace.block()->size() * 100;
+  size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
+  float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
+  //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
+  //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
+
+  float alpha = 1.0;
+  float beta = 0.0;
+  check_cudnn(cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
+                                indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
+                                (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
+                                (void*)(&beta), generate_tensor_nd_desc(t), tPtr
+                                ));
+
+  *out = tPtr[0];
+#endif  // CUDNN_MAJOR < 7
+}
+
+
 }  // namespace singa
 
 #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index 1b12f93..0aed832 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -79,7 +79,7 @@ void CudnnConvolution::InitCudnn(const Tensor &input) {
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(
       conv_desc_, pad_h_, pad_w_, stride_h_, stride_w_, 1, 1,  // dilation x and y
       CUDNN_CROSS_CORRELATION
-#if CUDNN_MAJOR == 5
+#if CUDNN_MAJOR >= 7
       , GetCudnnDataType(dtype)
 #endif  // CUDNN_MAJOR
       ));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/model/operation/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index e36df43..f700203 100755
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -199,8 +199,11 @@ CudnnConvHandle::CudnnConvHandle(const Tensor &input, const std::vector<size_t>&
                                          num_filters, 1, 1));
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc, pad_h, pad_w,
                                               stride_h, stride_w, 1, 1,
-                                              CUDNN_CROSS_CORRELATION,
-                                              GetCudnnDataType(dtype)));
+                                              CUDNN_CROSS_CORRELATION
+#if CUDNN_MAJOR >= 7
+                                              , GetCudnnDataType(dtype)
+#endif
+                                              ));
   CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc, GetCudnnDataType(dtype),
                                          CUDNN_TENSOR_NCHW, num_filters,
                                          channels, kernel_h, kernel_w));
@@ -381,4 +384,4 @@ Tensor GpuConvBackwardb(const Tensor &dy, const Tensor &b, const CudnnConvHandle
 }
 #endif  // USE_CUDNN
 
-}  // namespace singa
\ No newline at end of file
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/src/model/operation/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index 62ff254..9da881f 100755
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -5,6 +5,7 @@
 #include <vector>
 #include "singa/core/tensor.h"
 #include "singa/utils/logging.h"
+#include "singa/singa_config.h"
 
 #ifdef USE_CUDNN
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/tool/conda/singa/build.sh
----------------------------------------------------------------------
diff --git a/tool/conda/singa/build.sh b/tool/conda/singa/build.sh
index 91a2f3b..b54e451 100644
--- a/tool/conda/singa/build.sh
+++ b/tool/conda/singa/build.sh
@@ -23,12 +23,13 @@ export CMAKE_PREFIX_PATH=$PREFIX:$CMAKE_PREFIX_PATH
 export CMAKE_INCLUDE_PATH=$PREFIX/include:$CMAKE_INCLUDE_PATH
 export CMAKE_LIBRARY_PATH=$PREFIX/lib:$CMAKE_LIBRARY_PATH
 
+echo "----------------------$CUDNN_PATH---------------"
 if [ -z ${CUDNN_PATH+x} ]; then
 	USE_CUDA=OFF
 else
 	USE_CUDA=ON
-	cp -r $CUDNN_PATH/include $PREFIX/include
+	cp $CUDNN_PATH/include/* $PREFIX/include/
 	cp -P $CUDNN_PATH/lib64/libcudnn.so* $PREFIX/lib/
 fi

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e16cea12/tool/conda/singa/meta.yaml
----------------------------------------------------------------------
diff --git a/tool/conda/singa/meta.yaml b/tool/conda/singa/meta.yaml
index 997341c..ee76636 100644
--- a/tool/conda/singa/meta.yaml
+++ b/tool/conda/singa/meta.yaml
@@ -22,7 +22,7 @@ package:
   version: "{{ GIT_DESCRIBE_TAG }}"
 
 source:
-  git_url: https://github.com/apache/incubator-singa.git
+  path: /home/wangwei/incubator-singa/
 
 build:
   number: 0
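----------------------------------------------------------------------
Note on the autograd.py hunk: the Python Conv2D layer no longer passes
workspace_MB_limit and cudnn_prefer; the C++ CudnnConvHandle constructor now
defaults workspace_byte_limit to 1024 * 1024 * 1024 and prefer to "fastest"
(see model_operation.i above). A minimal sketch of the resulting call
pattern, assuming the SWIG module is imported as `singa` the way autograd.py
does, and using a hypothetical helper name:

    from singa import singa_wrap as singa  # SWIG-generated module (assumed import path)

    def get_handle(layer, x):
        # Rebuild the handle lazily, and only when the batch size changes,
        # so the cuDNN descriptors are not re-created on every forward pass.
        if not hasattr(layer, 'handle') or x.shape[0] != layer.handle.batchsize:
            layer.handle = singa.CudnnConvHandle(
                x.data, layer.kernel_size, layer.stride, layer.padding,
                layer.in_channels, layer.out_channels, layer.bias)
        layer.handle.device_id = x.device.id()
        return layer.handle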
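Note on the relocated Sum specialization in tensor_math_cuda.h: when
CUDNN_MAJOR < 7 it avoids cudnnReduceTensor and instead computes the sum as
a dot product with an all-ones tensor (Set fills `one` with 1, then Dot
reduces); since the fallback calls those specializations, Sum must appear
after them, as the in-code comment on instantiation-before-specialization
notes. The identity used is sum(x) = <x, 1>; a quick numpy illustration,
with numpy standing in for the CUDA kernels:

    import numpy as np

    x = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
    one = np.ones_like(x)                        # Set<float, lang::Cuda>(1, &one, ctx)
    assert np.isclose(np.dot(x, one), x.sum())   # Dot(in, one, out) equals Sum(in)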
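Note on the example update in mnist_cnn.py: tensors are now created on an
explicit device, and results are copied back with the public tensor.to_numpy
instead of the removed autograd.ctensor2numpy. A minimal sketch of the new
idiom, with a placeholder array shape:

    import numpy as np
    from singa import device, tensor

    dev = device.get_default_device()   # or device.create_cuda_gpu() for the GPU path
    x = tensor.Tensor(device=dev,
                      data=np.random.rand(4, 1, 28, 28).astype(np.float32))
    print(tensor.to_numpy(x).shape)     # copy back to host as a numpy array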
