SINGA-100 Implement layers using CUDNN for GPU training

Pass the cuBLAS handle from math_blob to math_addr. Tested configure/make for the CPU code; compilation succeeds with Makefile.gpu. TODO: set up the Context when creating worker threads.
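For readers skimming the diff, a minimal sketch of the new calling convention follows (illustrative only; the Context accessors mirror the ones used in the diff below, everything else is an assumption and not part of this patch):

    // Sketch: the gpu_* helpers now receive a long-lived cuBLAS handle
    // instead of creating and destroying one inside every call.
    #include <cublas_v2.h>

    float gpu_dot(cublasHandle_t handle, const float* A, const float* B,
                  const int n) {
      float result = 0.0f;
      cublasSdot(handle, n, A, 1, B, 1, &result);  // reuse the caller's handle
      return result;
    }

    // Caller side (math_blob.h style), using the Context accessors from the
    // diff; Singleton, Context and Blob are the existing SINGA utilities:
    //   auto context = Singleton<Context>::Instance();
    //   int device = context->device_id(std::this_thread::get_id());
    //   float res = gpu_dot(context->cublas_handle(device), A.gpu_data(),
    //                       B.gpu_data(), n);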
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6e563344
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6e563344
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6e563344

Branch: refs/heads/master
Commit: 6e5633441da13625b28971a42afda416ee05e1c6
Parents: 2ed18a5
Author: seaokcs <[email protected]>
Authored: Fri Nov 27 21:25:00 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Dec 11 11:48:23 2015 +0800

----------------------------------------------------------------------
 include/singa/utils/context.h             |  2 +
 include/singa/utils/math_addr.h           | 45 ++++++------------
 include/singa/utils/math_blob.h           | 64 +++++++++++++-------------
 src/neuralnet/neuron_layer/convolution.cc |  8 +---
 src/neuralnet/neuron_layer/pooling.cc     | 10 ++--
 src/proto/job.proto                       | 20 ++++----
 6 files changed, 61 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/context.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h
index 905b810..1d1802c 100644
--- a/include/singa/utils/context.h
+++ b/include/singa/utils/context.h
@@ -77,8 +77,10 @@ class Context {
    */
   Context() {
     for (int i = 0; i < kMaxNumGPU; i++) {
+#ifdef USE_GPU
       cublas_handle_.push_back(nullptr);
       curand_generator_.push_back(nullptr);
+#endif
     }
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index f548606..3b0eefd 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -171,65 +171,51 @@ void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {

 #ifdef USE_GPU
 template<typename Dtype>
-Dtype gpu_asum(int n, const Dtype* A, int inc) {
+Dtype gpu_asum(cublasHandle_t handle, int n, const Dtype* A, int inc) {
   Dtype result = 0.0;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSasum(handle, n, A, inc, &result);
-  cublasDestroy(handle);
   return result;
 }

 template<typename Dtype>
-void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
-    const int k, const Dtype alpha, const Dtype beta, const bool TranA,
-    const bool TranB, Dtype * C) {
+void gpu_gemm(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
+    const bool TranA, const bool TranB, Dtype * C) {
   int lda = TranA ? m : k;
   int ldb = TranB ? k : n;
   int ldc = n;
   cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+void gpu_gemv(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int m, const int n, const Dtype alpha, const Dtype beta,
+    const bool TranA, Dtype * C) {
   int lda = n;
   cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+void gpu_axpy(cublasHandle_t handle, const Dtype * A, const int n,
+    const Dtype alpha, Dtype * B) {
   cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_scale(const int n, const Dtype alpha, Dtype * A) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+void gpu_scale(cublasHandle_t handle, const int n, const Dtype alpha,
+    Dtype * A) {
   cublasSscal(handle, n, &alpha, A, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+Dtype gpu_dot(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int n) {
   Dtype result = 0.0;
   cublasSdot(handle, n, A, 1, B, 1, &result);
-  cublasDestroy(handle);
   return result;
 }

@@ -280,16 +266,11 @@ void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {

 template<typename Dtype, typename URNG>
 void gpu_sample_uniform(URNG g, int n, Dtype low, Dtype high, Dtype* A) {
-  //curandGenerator_t gen;
-  //curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-  curandSetPseudoRandomGeneratorSeed(g, time(NULL));
   curandGenerateUniform(g, A, n);
-  //curandDestroyGenerator(gen);
 }

 template<typename Dtype, typename URNG>
 void gpu_sample_gaussian(URNG g, int n, Dtype mean, Dtype std, Dtype* A) {
-  curandSetPseudoRandomGeneratorSeed(g, time(NULL));
   curandGenerateNormal(g, A, n, mean, std);
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index bbf7cc0..97d5cf7 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -49,8 +49,8 @@ void Scale(Dtype alpha, Blob<Dtype> * B) {
     cpu_scale(B->count(), alpha, B->mutable_cpu_data());
   else {
 #ifdef USE_GPU
-    // TODO(haibo) check it.
-    gpu_scale(B->count(), alpha, B->mutable_gpu_data());
+    gpu_scale(context->cublas_handle(device), B->count(), alpha,
+        B->mutable_gpu_data());
 #endif
   }
 }
@@ -67,7 +67,8 @@ void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
     cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    gpu_axpy(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
+    gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
+        B->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -106,9 +107,8 @@ void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_gpu_data());
+    gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
+        alpha, beta, TranA, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -172,9 +172,8 @@ void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A,
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_gpu_data());
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, k,
+        alpha, beta, TranA, TranB, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -216,7 +215,7 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
   } else {
 #ifdef USE_GPU
     // gpu part
-    res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
+    res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), n);
 #endif  // USE_GPU
   }
   return res;
@@ -244,8 +243,7 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
         false, false, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
@@ -264,10 +262,9 @@ void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-    // gpu part
+#ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
@@ -286,10 +283,10 @@ void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     // gpu part
     gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
@@ -305,8 +302,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
+#ifdef USE_GPU
+#endif  // USE_GPU
   }
 }
 /**
@@ -323,8 +320,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
         C->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
+#ifdef USE_GPU
+#endif  // USE_GPU
   }
 }
@@ -563,8 +560,8 @@ void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
-  }
 #endif  // USE_GPU
+  }
 }

 /**
@@ -588,8 +585,8 @@ void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     singa_gpu_sum_row(A.gpu_data(), B->gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
-  }
 #endif  // USE_GPU
+  }
 }

 /**
@@ -606,10 +603,10 @@ void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     // gpu part
     gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
 /**
@@ -626,9 +623,9 @@ void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }

@@ -640,13 +637,16 @@ Dtype Asum(const Blob<Dtype>& A) {
   if (A.count() == 0) return Dtype(0);
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
+  Dtype ret = Dtype(0);
   if (device == -1) {
-    return cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
+    ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
   } else {
 #ifdef USE_GPU
-    return gpu_asum(A.count(), A.cpu_data(), 1) / A.count();  // TODO(haibo)
+    ret = gpu_asum(context->cublas_handle(device), A.count(), A.cpu_data(), 1)
+        / A.count();
 #endif
   }
+  return ret;
 }


@@ -661,7 +661,6 @@ void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
         A->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // TODO(haibo) check
     gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
         A->mutable_gpu_data());
 #endif
@@ -678,9 +677,8 @@ void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
         A->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // TODO(haibo) check it.
-    gpu_sample_gaussian(context->curand_generator(thread), A->count(), mean, std,
-        A->mutable_gpu_data());
+    gpu_sample_gaussian(context->curand_generator(thread), A->count(),
+        mean, std, A->mutable_gpu_data());
 #endif
   }
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/neuralnet/neuron_layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/convolution.cc b/src/neuralnet/neuron_layer/convolution.cc
index 2b4c6d9..edfabb6 100644
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ b/src/neuralnet/neuron_layer/convolution.cc
@@ -40,17 +40,15 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   if (conv_conf.has_kernel()) {
     kernel_x_ = kernel_y_ = conv_conf.kernel();
   } else {
-    CHECK(conv_conf.has_kernel_x());
-    CHECK(conv_conf.has_kernel_y());
     kernel_x_ = conv_conf.kernel_x();
     kernel_y_ = conv_conf.kernel_y();
   }
+  CHECK_NE(kernel_x_, 0);
+  CHECK_NE(kernel_y_, 0);

   if (conv_conf.has_pad()) {
     pad_x_ = pad_y_ = conv_conf.pad();
   } else {
-    CHECK(conv_conf.has_pad_x());
-    CHECK(conv_conf.has_pad_y());
     pad_x_ = conv_conf.pad_x();
     pad_y_ = conv_conf.pad_y();
   }
@@ -58,8 +56,6 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   if (conv_conf.has_stride()) {
     stride_x_ = stride_y_ = conv_conf.stride();
   } else {
-    CHECK(conv_conf.has_stride_x());
-    CHECK(conv_conf.has_stride_y());
     stride_x_ = conv_conf.stride_x();
     stride_y_ = conv_conf.stride_y();
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/neuralnet/neuron_layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
index 2e246fc..5b408ba 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -37,24 +37,22 @@ void PoolingLayer::Setup(const LayerProto& conf,
   if (pool_conf.has_kernel()) {
     kernel_x_ = kernel_y_ = pool_conf.kernel();
   } else {
-    CHECK(pool_conf.has_kernel_x());
-    CHECK(pool_conf.has_kernel_y());
     kernel_x_ = pool_conf.kernel_x();
     kernel_y_ = pool_conf.kernel_y();
   }
+  CHECK_NE(kernel_x_, 0);
+  CHECK_NE(kernel_y_, 0);
+
   if (pool_conf.has_pad()) {
     pad_x_ = pad_y_ = pool_conf.pad();
   } else {
-    CHECK(pool_conf.has_pad_x());
-    CHECK(pool_conf.has_pad_y());
     pad_x_ = pool_conf.pad_x();
     pad_y_ = pool_conf.pad_y();
   }
+
   if (pool_conf.has_stride()) {
     stride_x_ = stride_y_ = pool_conf.stride();
   } else {
-    CHECK(pool_conf.has_stride_x());
-    CHECK(pool_conf.has_stride_y());
     stride_x_ = pool_conf.stride_x();
     stride_y_ = pool_conf.stride_y();
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 9d64c4b..12f6929 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -362,15 +362,14 @@ message ConvolutionProto {
   // The number of outputs for the layer
   optional int32 num_filters = 1;
   // the kernel height/width
-  optional int32 kernel = 2;
-
+  optional int32 kernel = 2 [default = 3];
   // The padding height/width
   optional int32 pad = 30 [default = 0];
   // the stride
   optional int32 stride = 31 [default = 1];

-  optional int32 kernel_x = 41;
-  optional int32 kernel_y = 42;
+  optional int32 kernel_x = 41 [default = 3];
+  optional int32 kernel_y = 42 [default = 3];

   optional int32 pad_x = 44 [default = 0];
   optional int32 pad_y = 45 [default = 0];
@@ -452,7 +451,7 @@ message LRNProto {

 message PoolingProto {
   // The kernel size (square)
-  optional int32 kernel= 1;
+  optional int32 kernel= 1 [default = 3];
   enum PoolMethod {
     MAX = 0;
     AVG = 1;
@@ -462,17 +461,16 @@ message PoolingProto {
   // The padding size
   optional uint32 pad = 31 [default = 0];
   // The stride
-  optional uint32 stride = 32 [default = 1];
-
+  optional uint32 stride = 32 [default = 2];

-  optional int32 kernel_x = 41;
-  optional int32 kernel_y = 42;
+  optional int32 kernel_x = 41 [default = 3];
+  optional int32 kernel_y = 42 [default = 3];

   optional int32 pad_x = 44 [default = 0];
   optional int32 pad_y = 45 [default = 0];

-  optional int32 stride_x = 47 [default = 1];
-  optional int32 stride_y = 48 [default = 1];
+  optional int32 stride_x = 47 [default = 2];
+  optional int32 stride_y = 48 [default = 2];
 }
