SINGA-100 Implement layers using CUDNN for GPU training

Pass the cuBLAS handle from math_blob to math_addr. Tested configure/make for the CPU code; compilation succeeds with Makefile.gpu. TODO: set up the Context when creating worker threads.
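For readers skimming the diff, a minimal sketch of the new calling convention follows (illustrative only; the Context accessors mirror the ones used in the diff below, everything else is an assumption and not part of this patch):

    // Sketch: the gpu_* helpers now receive a long-lived cuBLAS handle
    // instead of creating and destroying one inside every call.
    #include <cublas_v2.h>

    float gpu_dot(cublasHandle_t handle, const float* A, const float* B,
                  const int n) {
      float result = 0.0f;
      cublasSdot(handle, n, A, 1, B, 1, &result);  // reuse the caller's handle
      return result;
    }

    // Caller side (math_blob.h style), using the Context accessors from the
    // diff; Singleton, Context and Blob are the existing SINGA utilities:
    //   auto context = Singleton<Context>::Instance();
    //   int device = context->device_id(std::this_thread::get_id());
    //   float res = gpu_dot(context->cublas_handle(device), A.gpu_data(),
    //                       B.gpu_data(), n);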
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6e563344
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6e563344
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6e563344

Branch: refs/heads/master
Commit: 6e5633441da13625b28971a42afda416ee05e1c6
Parents: 2ed18a5
Author: seaokcs <[email protected]>
Authored: Fri Nov 27 21:25:00 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Dec 11 11:48:23 2015 +0800

----------------------------------------------------------------------
 include/singa/utils/context.h             |  2 +
 include/singa/utils/math_addr.h           | 45 ++++++------------
 include/singa/utils/math_blob.h           | 64 +++++++++++++-------------
 src/neuralnet/neuron_layer/convolution.cc |  8 +---
 src/neuralnet/neuron_layer/pooling.cc     | 10 ++--
 src/proto/job.proto                       | 20 ++++----
 6 files changed, 61 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/context.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h
index 905b810..1d1802c 100644
--- a/include/singa/utils/context.h
+++ b/include/singa/utils/context.h
@@ -77,8 +77,10 @@ class Context {
    */
   Context() {
     for (int i = 0; i < kMaxNumGPU; i++) {
+#ifdef USE_GPU
       cublas_handle_.push_back(nullptr);
       curand_generator_.push_back(nullptr);
+#endif
     }
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index f548606..3b0eefd 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -171,65 +171,51 @@ void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {

 #ifdef USE_GPU
 template<typename Dtype>
-Dtype gpu_asum(int n, const Dtype* A, int inc) {
+Dtype gpu_asum(cublasHandle_t handle, int n, const Dtype* A, int inc) {
   Dtype result = 0.0;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSasum(handle, n, A, inc, &result);
-  cublasDestroy(handle);
   return result;
 }

 template<typename Dtype>
-void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
-    const int k, const Dtype alpha, const Dtype beta, const bool TranA,
-    const bool TranB, Dtype * C) {
+void gpu_gemm(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
+    const bool TranA, const bool TranB, Dtype * C) {
   int lda = TranA ? m : k;
   int ldb = TranB ? k : n;
   int ldc = n;
   cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+void gpu_gemv(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int m, const int n, const Dtype alpha, const Dtype beta,
+    const bool TranA, Dtype * C) {
   int lda = n;
   cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
   cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+void gpu_axpy(cublasHandle_t handle, const Dtype * A, const int n,
+    const Dtype alpha, Dtype * B) {
   cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-void gpu_scale(const int n, const Dtype alpha, Dtype * A) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+void gpu_scale(cublasHandle_t handle, const int n, const Dtype alpha,
+    Dtype * A) {
   cublasSscal(handle, n, &alpha, A, 1);
-  cublasDestroy(handle);
 }

 template<typename Dtype>
-Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
+Dtype gpu_dot(cublasHandle_t handle, const Dtype * A, const Dtype * B,
+    const int n) {
   Dtype result = 0.0;
   cublasSdot(handle, n, A, 1, B, 1, &result);
-  cublasDestroy(handle);
   return result;
 }

@@ -280,16 +266,11 @@ void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {

 template<typename Dtype, typename URNG>
 void gpu_sample_uniform(URNG g, int n, Dtype low, Dtype high, Dtype* A) {
-  //curandGenerator_t gen;
-  //curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-  curandSetPseudoRandomGeneratorSeed(g, time(NULL));
   curandGenerateUniform(g, A, n);
-  //curandDestroyGenerator(gen);
 }

 template<typename Dtype, typename URNG>
 void gpu_sample_gaussian(URNG g, int n, Dtype mean, Dtype std, Dtype* A) {
-  curandSetPseudoRandomGeneratorSeed(g, time(NULL));
   curandGenerateNormal(g, A, n, mean, std);
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/include/singa/utils/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
index bbf7cc0..97d5cf7 100644
--- a/include/singa/utils/math_blob.h
+++ b/include/singa/utils/math_blob.h
@@ -49,8 +49,8 @@ void Scale(Dtype alpha, Blob<Dtype> * B) {
     cpu_scale(B->count(), alpha, B->mutable_cpu_data());
   else {
 #ifdef USE_GPU
-    // TODO(haibo) check it.
-    gpu_scale(B->count(), alpha, B->mutable_gpu_data());
+    gpu_scale(context->cublas_handle(device), B->count(), alpha,
+        B->mutable_gpu_data());
 #endif
   }
 }
@@ -67,7 +67,8 @@ void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
     cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    gpu_axpy(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
+    gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
+        B->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -106,9 +107,8 @@ void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_gpu_data());
+    gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
+        alpha, beta, TranA, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -172,9 +172,8 @@ void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A,
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_gpu_data());
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, k,
+        alpha, beta, TranA, TranB, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
 }
@@ -216,7 +215,7 @@ Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
   } else {
 #ifdef USE_GPU
     // gpu part
-    res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
+    res = gpu_dot(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), n);
 #endif  // USE_GPU
   }
   return res;
@@ -244,8 +243,7 @@ void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
         C->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // gpu part
-    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
+    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
         false, false, C->mutable_gpu_data());
 #endif  // USE_GPU
   }
@@ -264,10 +262,9 @@ void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-    // gpu part
+#ifdef USE_GPU
     gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
@@ -286,10 +283,10 @@ void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     // gpu part
     gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
@@ -305,8 +302,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
   if (device == -1) {
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
+#ifdef USE_GPU
+#endif  // USE_GPU
   }
 }
 /**
@@ -323,8 +320,8 @@ void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
         C->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
-#endif  // SINGA_GPU
+#ifdef USE_GPU
+#endif  // USE_GPU
   }
 }
@@ -563,8 +560,8 @@ void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
-  }
 #endif  // USE_GPU
+  }
 }

 /**
@@ -588,8 +585,8 @@ void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
 #ifdef USE_GPU
     singa_gpu_sum_row(A.gpu_data(), B->gpu_data(), m, n, n);
     // gpu part (TODO check transpose case)
-  }
 #endif  // USE_GPU
+  }
 }

 /**
@@ -606,10 +603,10 @@ void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     // gpu part
     gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }
 /**
@@ -626,9 +623,9 @@ void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   if (device == -1) {
     cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
   } else {
-#ifdef SINGA_GPU
+#ifdef USE_GPU
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#endif  // SINGA_GPU
+#endif  // USE_GPU
   }
 }

@@ -640,13 +637,16 @@ Dtype Asum(const Blob<Dtype>& A) {
   if (A.count() == 0) return Dtype(0);
   auto context = Singleton<Context>::Instance();
   int device = context->device_id(std::this_thread::get_id());
+  Dtype ret = Dtype(0);
   if (device == -1) {
-    return cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
+    ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
   } else {
 #ifdef USE_GPU
-    return gpu_asum(A.count(), A.cpu_data(), 1) / A.count();  // TODO(haibo)
+    ret = gpu_asum(context->cublas_handle(device), A.count(), A.cpu_data(), 1)
+        / A.count();
 #endif
   }
+  return ret;
 }


@@ -661,7 +661,6 @@ void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
         A->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // TODO(haibo) check
     gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
         A->mutable_gpu_data());
 #endif
@@ -678,9 +677,8 @@ void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
         A->mutable_cpu_data());
   } else {
 #ifdef USE_GPU
-    // TODO(haibo) check it.
-    gpu_sample_gaussian(context->curand_generator(thread), A->count(), mean, std,
-        A->mutable_gpu_data());
+    gpu_sample_gaussian(context->curand_generator(thread), A->count(),
+        mean, std, A->mutable_gpu_data());
 #endif
   }
 }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/neuralnet/neuron_layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/convolution.cc b/src/neuralnet/neuron_layer/convolution.cc
index 2b4c6d9..edfabb6 100644
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ b/src/neuralnet/neuron_layer/convolution.cc
@@ -40,17 +40,15 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   if (conv_conf.has_kernel()) {
     kernel_x_ = kernel_y_ = conv_conf.kernel();
   } else {
-    CHECK(conv_conf.has_kernel_x());
-    CHECK(conv_conf.has_kernel_y());
     kernel_x_ = conv_conf.kernel_x();
     kernel_y_ = conv_conf.kernel_y();
   }
+  CHECK_NE(kernel_x_, 0);
+  CHECK_NE(kernel_y_, 0);

   if (conv_conf.has_pad()) {
     pad_x_ = pad_y_ = conv_conf.pad();
   } else {
-    CHECK(conv_conf.has_pad_x());
-    CHECK(conv_conf.has_pad_y());
     pad_x_ = conv_conf.pad_x();
     pad_y_ = conv_conf.pad_y();
   }
@@ -58,8 +56,6 @@ void ConvolutionLayer::Setup(const LayerProto& conf,
   if (conv_conf.has_stride()) {
     stride_x_ = stride_y_ = conv_conf.stride();
   } else {
-    CHECK(conv_conf.has_stride_x());
-    CHECK(conv_conf.has_stride_y());
     stride_x_ = conv_conf.stride_x();
     stride_y_ = conv_conf.stride_y();
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/neuralnet/neuron_layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
index 2e246fc..5b408ba 100644
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ b/src/neuralnet/neuron_layer/pooling.cc
@@ -37,24 +37,22 @@ void PoolingLayer::Setup(const LayerProto& conf,
   if (pool_conf.has_kernel()) {
     kernel_x_ = kernel_y_ = pool_conf.kernel();
   } else {
-    CHECK(pool_conf.has_kernel_x());
-    CHECK(pool_conf.has_kernel_y());
     kernel_x_ = pool_conf.kernel_x();
     kernel_y_ = pool_conf.kernel_y();
   }
+  CHECK_NE(kernel_x_, 0);
+  CHECK_NE(kernel_y_, 0);
+
   if (pool_conf.has_pad()) {
     pad_x_ = pad_y_ = pool_conf.pad();
   } else {
-    CHECK(pool_conf.has_pad_x());
-    CHECK(pool_conf.has_pad_y());
     pad_x_ = pool_conf.pad_x();
     pad_y_ = pool_conf.pad_y();
   }
+
   if (pool_conf.has_stride()) {
     stride_x_ = stride_y_ = pool_conf.stride();
   } else {
-    CHECK(pool_conf.has_stride_x());
-    CHECK(pool_conf.has_stride_y());
     stride_x_ = pool_conf.stride_x();
     stride_y_ = pool_conf.stride_y();
   }


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6e563344/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 9d64c4b..12f6929 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -362,15 +362,14 @@ message ConvolutionProto {
   // The number of outputs for the layer
   optional int32 num_filters = 1;
   // the kernel height/width
-  optional int32 kernel = 2;
-
+  optional int32 kernel = 2 [default = 3];
   // The padding height/width
   optional int32 pad = 30 [default = 0];
   // the stride
   optional int32 stride = 31 [default = 1];

-  optional int32 kernel_x = 41;
-  optional int32 kernel_y = 42;
+  optional int32 kernel_x = 41 [default = 3];
+  optional int32 kernel_y = 42 [default = 3];

   optional int32 pad_x = 44 [default = 0];
   optional int32 pad_y = 45 [default = 0];
@@ -452,7 +451,7 @@ message LRNProto {

 message PoolingProto {
   // The kernel size (square)
-  optional int32 kernel= 1;
+  optional int32 kernel= 1 [default = 3];
   enum PoolMethod {
     MAX = 0;
     AVG = 1;
@@ -462,17 +461,16 @@ message PoolingProto {
   // The padding size
   optional uint32 pad = 31 [default = 0];
   // The stride
-  optional uint32 stride = 32 [default = 1];
-
+  optional uint32 stride = 32 [default = 2];

-  optional int32 kernel_x = 41;
-  optional int32 kernel_y = 42;
+  optional int32 kernel_x = 41 [default = 3];
+  optional int32 kernel_y = 42 [default = 3];

   optional int32 pad_x = 44 [default = 0];
   optional int32 pad_y = 45 [default = 0];

-  optional int32 stride_x = 47 [default = 1];
-  optional int32 stride_y = 48 [default = 1];
+  optional int32 stride_x = 47 [default = 2];
+  optional int32 stride_y = 48 [default = 2];
 }
