Repository: incubator-singa Updated Branches: refs/heads/master 1981874fc -> 8ac511c70
SINGA-100 Implement layers using CUDNN for GPU training Update dropout layer to run for both cudnn and cpu training mode. Update the cpu part of math_blob and math_addr to use Context class for getting cpu random generators. TODO update math_blob and math_addr for GPU code, e.g., sampling. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5d35ef26 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5d35ef26 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5d35ef26 Branch: refs/heads/master Commit: 5d35ef267326be519bc0a7ea7d2847a6c5056181 Parents: 49293a6 Author: Wei Wang <[email protected]> Authored: Fri Nov 27 15:27:31 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Fri Dec 11 11:01:29 2015 +0800 ---------------------------------------------------------------------- include/singa/utils/math_addr.h | 50 ++- include/singa/utils/math_blob.h | 408 ++++++++++++--------- include/singa/utils/singa_op.h | 28 ++ src/neuralnet/layer.cc | 7 +- src/neuralnet/loss_layer/cudnn_softmaxloss.cu | 52 +++ src/neuralnet/neuron_layer/activation.cc | 83 +++++ src/neuralnet/neuron_layer/dropout.cc | 4 +- 7 files changed, 445 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h index b21ea45..f63ff78 100644 --- a/include/singa/utils/math_addr.h +++ b/include/singa/utils/math_addr.h @@ -64,11 +64,21 @@ void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, } template<typename Dtype> -void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { +void cpu_axpy(const int n, const Dtype alpha, const Dtype * A, Dtype * B) { 
    cblas_saxpy(n, alpha, A, 1, B, 1);
 }
 
 template<typename Dtype>
+void cpu_scale(const int n, const Dtype alpha, Dtype * A) {
+  cblas_sscal(n, alpha, A, 1);
+}
+
+template<typename Dtype>
+void cpu_copy(const int n, const Dtype* A, Dtype *B) {
+  cblas_scopy(n, A, 1, B, 1);
+}
+
+template<typename Dtype>
 Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
   Dtype sum = 0;
   for (int i = 0 ; i < n ; i++)
@@ -122,22 +132,42 @@ void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
     Op::Map(A[i], n, B+i*n);
   }
 }
-// expand each element in A into a row of B
+
 template<typename Dtype>
-void cpu_sample_uniform(int n, Dtype low, Dtype high, Dtype* A);
+// Row-wise softmax of the nb_rows x nb_cols matrix A into B.
+// NOTE(review): reads from A and writes into B; the originally submitted
+// version wrote through the const input pointer (dptr[x] = ...), which fails
+// to compile and would have left the output B untouched.
+void cpu_softmax(int nb_rows, int nb_cols, const Dtype* A, Dtype* B) {
+  for (int i = 0; i < nb_rows; i++) {
+    const Dtype* aptr = A + i * nb_cols;
+    Dtype* bptr = B + i * nb_cols;
+    Dtype mmax = aptr[0];  // subtract the row max for numerical stability
+    for (int x = 1; x < nb_cols; ++x )
+      if (mmax < aptr[x]) mmax = aptr[x];
+    Dtype sum = 0.0f;
+    for (int x = 0; x < nb_cols; ++x ) {
+      bptr[x] = std::exp(aptr[x] - mmax );
+      sum += bptr[x];
+    }
+    for (int x = 0; x < nb_cols; ++x ) {
+      bptr[x] /= sum;
+    }
+  }
+}

-template<>
-inline void cpu_sample_uniform<float>(int n, float low, float high, float* A) {
-}
-template<typename Dtype>
-void cpu_sample_gaussian(int n, Dtype mean, Dtype std, Dtype* A);
-template<>
-inline void cpu_sample_gaussian<float>(int n, float mean, float std, float* A) {
+template<typename Dtype, typename URNG>
+void cpu_sample_uniform(URNG& g, int n, Dtype low, Dtype high, Dtype* A) {
+  std::uniform_real_distribution<Dtype> distribution(low, high);
+  for (int i = 0; i < n; i++)
+    A[i] = distribution(g);
+}
+template<typename Dtype, typename URNG>
+void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {
+  std::normal_distribution<Dtype> distribution(mean, std);
+  for (int i = 0; i < n; i++)
+    A[i] = distribution(g);
 }
+
 #ifdef USE_GPU
 template<typename Dtype>
 void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h index ca75205..ce40d4f 100644 --- a/include/singa/utils/math_blob.h +++ b/include/singa/utils/math_blob.h @@ -42,30 +42,34 @@ enum XPU {cpu, gpu, any}; * Use blas scale internally. */ template<typename Dtype> -void Scale(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count(), B->count()); - if (xpu == cpu) - cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); +void Scale(Dtype alpha, Blob<Dtype> * B) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) + cpu_scale(B->count(), alpha, B->mutable_cpu_data()); + else { #ifdef USE_GPU + // TODO(haibo) check it. +// gpu_scale(B->count(), alpha, B->mutable_gpu_data()); #endif + } } /** * Element-wise operation: Bi = alpha*Ai+Bi. 
A and B should have the same size */ template<typename Dtype> -void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { +void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count(), B->count()); - if (xpu == cpu) { - cpu_axpy(A.cpu_data(), A.count(), - alpha, B->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { - gpu_axpy(A.gpu_data(), A.count(), - alpha, B->mutable_gpu_data()); - } + gpu_axpy(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data()); #endif // USE_GPU + } } /************* BLAS level 2 *****************/ @@ -83,7 +87,7 @@ void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { * @param[in, out] C, vector */ template<typename Dtype> -void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, +void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { CHECK_EQ(A.shape().size(), 2) << "A must be a matrix"; int a1, a2, m, n; @@ -95,17 +99,18 @@ void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C"; bool TranA = A.transpose(); - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA, C->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /** * Matrix vector multiplication, C = A(.T) * B, transpose is considered. 
@@ -119,9 +124,9 @@ void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, * @param[out] C output vector */ template <typename Dtype> -void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, +void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { - GEMV(xpu, Dtype(1), Dtype(0), A, B, C); + GEMV(Dtype(1), Dtype(0), A, B, C); } /************* BLAS level 3 *****************/ @@ -140,7 +145,7 @@ void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * @param[in, out] C, matrix */ template <typename Dtype> -void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, +void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype> & B, Blob<Dtype> * C) { CHECK_EQ(A.shape().size(), 2); CHECK_EQ(B.shape().size(), 2); @@ -160,17 +165,18 @@ void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, int k = A.transpose() ? A.shape(0) : A.shape(1); bool TranA = A.transpose(); bool TranB = B.transpose(); - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, - TranA, TranB, C->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, + C->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /** * Matrix multiplication, C = A(.T) * B(.T), transpose is considered. 
@@ -183,9 +189,9 @@ void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, * @param[out] C output matrix */ template <typename Dtype> -void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, +void MMDot(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { - GEMM(xpu, Dtype(1), Dtype(0), A, B, C); + GEMM(Dtype(1), Dtype(0), A, B, C); } @@ -199,19 +205,20 @@ void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * @return inner product value. */ template <typename Dtype> -Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { +Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) { Dtype res = 0; CHECK_EQ(A.count(), B.count()); int n = A.count(); - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { res = cpu_dot(A.cpu_data(), B.cpu_data(), n); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part res = gpu_dot(A.gpu_data(), B.gpu_data(), n); - } #endif // USE_GPU + } return res; } @@ -224,25 +231,24 @@ Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { * @param[out] C, output matrix */ template <typename Dtype> -void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, - Blob<Dtype> * C) { +void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) { CHECK(!C->transpose()); // do not support C.T now. 
int m = A.count(); int n = B.count(); CHECK_EQ(C->count(), m * n); - - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, - false, false, C->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, + C->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /*********************** Element-wise functions ***********************/ /** @@ -251,17 +257,18 @@ void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * Loose shape checking, A.count() == B.count(). */ template<typename Op, typename Dtype> -void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { +void Map(const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data()); - } + } else { #ifdef SINGA_GPU - if (xpu == gpu) { // gpu part gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); - } #endif // SINGA_GPU + } } /** @@ -270,19 +277,20 @@ void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * Loose shape checking, A, B and C are of the same size. 
 */
template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
+void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
   CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
   CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
-  if (xpu == gpu) {  // gpu part
     gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-  }
 #endif  // SINGA_GPU
+  }
 }
(NOTE(review): the submitted hunk also added an unconditional
`cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());`
line before the device check, duplicating the CPU-branch call — the CPU kernel
would run twice on CPU and would also run when a GPU device is selected. That
stray added line is removed here.)

 /**
@@ -290,28 +298,34 @@ void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
  * Loose shape checking, A.count() == B.count().
  */
 template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
+void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
   CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
 #endif  // SINGA_GPU
+  }
 }

 /**
  * Ci = Op(alpha, Ai, Bi)
  * Loose shape checking, A, B and C are of the same size.
 */
template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
+void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     Blob<Dtype>* C) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
+  // NOTE(review): B is a const reference, not a pointer — `B->count()` /
+  // `B->cpu_data()` do not compile; also check C's size before writing it.
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B.cpu_data(),
         C->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
 #endif  // SINGA_GPU
+  }
 }

 /**
@@ -322,24 +336,46 @@ void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
  * Loose shape checking, A.count() == B.count().
  */
 template<typename Dtype>
-void Copy(XPU xpu, const Blob<Dtype>& A, Blob<Dtype>* B) {
+void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
   CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
   } else {
-    LOG(FATAL) << "Not implemented";
+#ifdef USE_GPU
+    // TODO(review): add gpu copy
+#else
+    // keep failing loudly instead of silently doing nothing when a GPU
+    // device is selected but the build has no GPU support
+    LOG(FATAL) << "Not implemented";
+#endif
   }
 }
+
+/**
+ * B = alpha + A
+ * Implemented using Map with op::Add.
+ */
+template<typename Dtype>
+void Add(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Add<Dtype>>(alpha, A, B);
+}
+
 /**
  * C = A + B
  * Implemented using Copy and AXPY.
  */
 template<typename Dtype>
-void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
   Copy(A, C);
-  AXPY(B, C, 1);
+  AXPY(Dtype(1), B, C);
+}
+
+/**
+ * B = alpha - A
+ * Implemented using Map with op::Sub.
+ */ +template<typename Dtype> +void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Sub<Dtype>>(alpha, A, B); } /** @@ -347,10 +383,10 @@ void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, * Implemented using Copy and AXPY. */ template<typename Dtype> -void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, +void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - Copy(xpu, A, C); - AXPY(xpu, B, C, -1); + Copy(A, C); + AXPY(Dtype(-1), B, C); } /** @@ -360,19 +396,8 @@ void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, template<typename Dtype> void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - //Map<singa::op::Mult<Dtype>>(xpu, A, B, C); + Map<singa::op::Mult<Dtype>>(A, B, C); // TODO(wangwei) use MKL's vector func - auto context = Singleton<Context>::Instance(); - int device = context->device_id(std::this_thread::get_id()); - if (device == -1) - cpu_e_f<op::Mult<Dtype>>(C->count(), A.cpu_data(), B.cpu_data(), - C->mutable_cpu_data()); - else { -#ifdef USE_GPU - gpu_e_f<op::Mult<Dtype>>(C->count(), A.gpu_data(), B.gpu_data(), - C->mutable_gpu_data()); -#endif - } } /** @@ -380,11 +405,46 @@ void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B, * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). 
*/ template<typename Dtype> -void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, +void Div(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - Map<singa::op::Div<Dtype>>(xpu, A, B, C); + Map<singa::op::Div<Dtype>>(A, B, C); // TODO(wangwei) use MKL's vector func } +/** + * B = sqrt(A) + */ +template<typename Dtype> +void Sqrt(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Sqrt<Dtype>, Dtype>(A, B); +} +/** + * B = square(A) + */ +template<typename Dtype> +void Square(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Square<Dtype>, Dtype>(A, B); +} +/** + * B = exp(A) + */ +template<typename Dtype> +void Exp(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Exp<Dtype>, Dtype>(A, B); +} +/** + * B = log(A) + */ +template<typename Dtype> +void Log(const Blob<Dtype>& A, Blob<Dtype>* B) { + Map<singa::op::Log<Dtype>, Dtype>(A, B); +} +/** + * B = tanh(A) + */ +template<typename Dtype> +void Tanh(const Blob<Dtype>& A, Blob<Dtype>* B) { + Map<singa::op::Tanh<Dtype>, Dtype>(A, B); +} /*************************1D<-->2D op/transform***************************/ /** * Add A to each column of B, i.e., Bij = alpha*Ai + beta*Bij @@ -392,28 +452,26 @@ void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, * # columns of B = B.count() / A.count(). 
*/ template<typename Dtype> -void MVAddCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { if (B->transpose()) { - Blob<Dtype>* tmp = Transpose(* B); - MVAddRow(xpu, alpha, beta, A, tmp); - delete tmp; + B->set_transpose(false); + MVAddRow(alpha, beta, A, B); + B->set_transpose(true); } else { CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A"; int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, - false, false, B->mutable_cpu_data()); - } + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, false, false, + B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { - singa_gpu_add_vec_row(B->gpu_data(), - A.gpu_data(), A.gpu_data(), m, n, n); - // gpu part - } + singa_gpu_add_vec_row(B->gpu_data(), A.gpu_data(), A.gpu_data(), m, n, n); #endif // USE_GPU + } } } /** @@ -422,8 +480,8 @@ void MVAddCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # columns of B = B.count() / A.count(). */ template<typename Dtype> -void MVAddCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { - MVAddCol(xpu, Dtype(1), Dtype(1), A, B); +void MVAddCol(const Blob<Dtype> & A, Blob<Dtype>* B) { + MVAddCol(Dtype(1), Dtype(1), A, B); } /** @@ -432,28 +490,26 @@ void MVAddCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { * # rows of B = B.count() / A.count(). 
*/ template<typename Dtype> -void MVAddRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { if (B->transpose()) { - Blob<Dtype>* tmp = Transpose(* B); - MVAddCol(xpu, alpha, beta, A, tmp); - delete tmp; + B->set_transpose(false); + MVAddCol(alpha, beta, A, B); + B->set_transpose(true); } else { CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A"; int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemm(one.cpu_data(), A.cpu_data(), n, m, 1, alpha, beta, false, false, B->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { - // gpu part - singa_gpu_add_vec_row(B->gpu_data(), - A.gpu_data(), A.gpu_data(), m, n, n); - } + singa_gpu_add_vec_row(B->gpu_data(), A.gpu_data(), A.gpu_data(), m, n, n); #endif // USE_GPU + } } } /** @@ -462,8 +518,8 @@ void MVAddRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # rows of B = B.count() / A.count(). */ template<typename Dtype> -void MVAddRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { - MVAddRow(xpu, Dtype(1), Dtype(1), A, B); +void MVAddRow(const Blob<Dtype> & A, Blob<Dtype>* B) { + MVAddRow(Dtype(1), Dtype(1), A, B); } /** @@ -472,8 +528,8 @@ void MVAddRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { * # columns of B = B.count() / A.count(). */ template<typename Dtype> -void RepmatCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - MVAddCol(xpu, Dtype(1), Dtype(0), A, B); +void RepmatCol(const Blob<Dtype> & A, Blob<Dtype> * B) { + MVAddCol(Dtype(1), Dtype(0), A, B); } /** @@ -482,8 +538,8 @@ void RepmatCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # rows of B = B.count() / A.count(). 
*/ template<typename Dtype> -void RepmatRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - MVAddRow(xpu, Dtype(1), Dtype(0), A, B); +void RepmatRow(const Blob<Dtype> & A, Blob<Dtype> * B) { + MVAddRow(Dtype(1), Dtype(0), A, B); } /** @@ -493,18 +549,18 @@ void RepmatRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # columns of A = A.count() / B.count(). */ template<typename Dtype> -void MVSumCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta, A.transpose(), false, B->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); // gpu part (TODO check transpose case) } @@ -518,18 +574,18 @@ void MVSumCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # rows of A = A.count() / B.count(). 
*/ template<typename Dtype> -void MVSumRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta, - A.transpose(), false, B->mutable_cpu_data()); - } + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta, A.transpose(), + false, B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { singa_gpu_sum_row(A.gpu_data(), B->gpu_data(), m, n, n); // gpu part (TODO check transpose case) } @@ -542,18 +598,19 @@ void MVSumRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # columns of A = A.count() / B.count(). */ template<typename Op, typename Dtype> -void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { +void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } + } else { #ifdef SINGA_GPU - if (xpu == gpu) { // gpu part gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } #endif // SINGA_GPU + } } /** * Duplicate each element of A into a row of B. @@ -561,62 +618,67 @@ void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # columns of B = B.count() / A.count(). 
 */
template<typename Op, typename Dtype>
-void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
   int m = A.count(), n = B->count() / m;
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-  }
 #endif  // SINGA_GPU
+  }
 }

 /**
  * Average the absolute values.
  */
-template <typename Dtype>
-Dtype Asum(XPU xpu, const Blob<Dtype>& A) {
+template<typename Dtype>
+Dtype Asum(const Blob<Dtype>& A) {
   if (A.count() == 0) return Dtype(0);
-  if (xpu == cpu)
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     return cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
-  return Dtype(0);  // avoid compile warning
+  }
+#ifdef USE_GPU
+  // TODO(haibo) gpu asum
+#endif
+  // NOTE(review): the submitted version put `return 0;` only inside the
+  // #ifdef USE_GPU branch, so a CPU-only build could fall off the end of a
+  // non-void function (undefined behavior). Always return a value here.
+  return Dtype(0);
 }
 
 /*************Random Sample***************/
 template<typename Dtype>
-void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A);
-
-template<>
-inline void SampleUniform<float>(float low, float high, Blob<float> *A) {
+void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
   auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device == -1)
-    cpu_sample_uniform(A->count(), low, high, A->mutable_cpu_data());
-  else {
+  const auto& thread = std::this_thread::get_id();
+  int device = context->device_id(thread);
+  if (device == -1) {
+    cpu_sample_uniform(*context->rand_generator(thread), A->count(), low, high,
+                       A->mutable_cpu_data());
+  } else {
 #ifdef USE_GPU
+    // TODO(haibo) check
     gpu_sample_uniform(A->count(), low, high,
A->mutable_gpu_data()); #endif } } template<typename Dtype> -void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A); - -template<> -inline void SampleGaussian<float>(float low, float high, Blob<float> *A) { +void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) { auto context = Singleton<Context>::Instance(); - int device = context->device_id(std::this_thread::get_id()); - if (device == -1) - cpu_sample_gaussian(A->count(), low, high, A->mutable_cpu_data()); - else { + const auto& thread = std::this_thread::get_id(); + int device = context->device_id(thread); + if (device == -1) { + cpu_sample_gaussian(*context->rand_generator(thread), A->count(), mean, std, + A->mutable_cpu_data()); + } else { #ifdef USE_GPU - gpu_sample_gaussian(A->count(), low, high, A->mutable_gpu_data()); + // TODO(haibo) check it. + gpu_sample_gaussian(A->count(), mean, std, A->mutable_gpu_data()); #endif } } @@ -627,11 +689,15 @@ void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) { CHECK_GT(nb_rows, 0); CHECK_EQ(A.count() % nb_rows, 0); CHECK_EQ(A.count(), B->count()); - -#ifdef USE_GPU - cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(), + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(), B->mutable_cpu_data()); + } else { +#ifdef USE_GPU #endif // USE_GPU + } } } // end of namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h index c934050..7499eb1 100644 --- a/include/singa/utils/singa_op.h +++ b/include/singa/utils/singa_op.h @@ -211,6 +211,34 @@ struct Pow { } #endif // USE_GPU }; + +template<typename Dtype> +struct Add { + inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { + *c = a + b; + } 
+#ifdef USE_GPU + inline static void CudaMap(const Dtype * a, + const Dtype * b, Dtype * c, int n) { +// singa::singa_gpu_add(a, b, c, n); // TODO(haibo) + } +#endif // USE_GPU +}; + +template<typename Dtype> +struct Sub { + inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { + *c = a - b; + } +#ifdef USE_GPU + inline static void CudaMap(const Dtype * a, + const Dtype * b, Dtype * c, int n) { +// singa::singa_gpu_add(a, b, c, n); // TODO(haibo) + } +#endif // USE_GPU +}; + + template<typename Dtype> struct Mult { inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/layer.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc index be38ac5..9414948 100644 --- a/src/neuralnet/layer.cc +++ b/src/neuralnet/layer.cc @@ -48,17 +48,16 @@ const std::string Layer::ToString(bool debug, int flag) { return ""; string ret = StringPrintf("Layer %10s ", name().c_str()); if ((flag & kForward) == kForward && data_.count() !=0) { - ret += StringPrintf("data norm1 %13.9f", Asum(cpu, data_)); + ret += StringPrintf("data norm1 %13.9f", Asum(data_)); } else if ((flag & kBackward) == kBackward) { if (grad_.count() != 0) - ret += StringPrintf("grad norm1 %13.9f\n", Asum(cpu, grad_)); + ret += StringPrintf("grad norm1 %13.9f\n", Asum(grad_)); } if ((flag & kTrain) == kTrain) { for (Param* p : GetParams()) { ret += StringPrintf( "param id %2d, name %10s, value norm1 %13.9f, grad norm1 %13.9f\n", - p->id(), p->name().c_str(), Asum(cpu, p->data()), - Asum(cpu, p->grad())); + p->id(), p->name().c_str(), Asum(p->data()), Asum(p->grad())); } } return ret; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/loss_layer/cudnn_softmaxloss.cu ---------------------------------------------------------------------- diff --git a/src/neuralnet/loss_layer/cudnn_softmaxloss.cu 
b/src/neuralnet/loss_layer/cudnn_softmaxloss.cu new file mode 100644 index 0000000..e0af05f --- /dev/null +++ b/src/neuralnet/loss_layer/cudnn_softmaxloss.cu @@ -0,0 +1,52 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + +#include "singa/neuralnet/loss_layer.h" + +namespace singa { +void CudnnSoftmaxLossLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + CudnnSoftmaxLayer::Setup(conf, srclayers); + topk_ = conf.softmaxloss_conf().topk(); + loss_ = accuracy_ = 0.0f; + counter_ = 0; +} +void CudnnSoftmaxLossLayer::ComputeFeature(int flag, + const vector<Layer*>& srclayers) { + CudnnSoftmaxLayer::ComputeFeature(flag, srclayers); + // compute loss + counter_++; + // add loss and accuracy +} + +void CudnnSoftmaxLossLayer::ComputeGradient(int flag, + const vector<Layer*>& srclayers) { + // compute gradient +} + +const std::string CudnnSoftmaxLossLayer::ToString(bool debug, int flag) { + string disp = "Loss = " + std::to_string(loss_ / counter_) + + ", accuracy = " + std::to_string(accuracy_ / counter_); + counter_ = 0; + loss_ = accuracy_ = 0; + return disp; +} +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/neuron_layer/activation.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/activation.cc b/src/neuralnet/neuron_layer/activation.cc new file mode 100644 index 0000000..6f62646 --- /dev/null +++ b/src/neuralnet/neuron_layer/activation.cc @@ -0,0 +1,83 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#include "singa/neuralnet/neuron_layer.h" +#include "singa/utils/math_blob.h" +#include "singa/utils/singa_op.h" +#include "singa/proto/job.pb.h" +namespace singa { + +void ActivationLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + NeuronLayer::Setup(conf, srclayers); + data_.ReshapeLike(srclayers[0]->data(this)); + grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this))); +} +void ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) +{ + switch (layer_conf_.activation_conf().type()) { + case RELU: + Map<op::Relu<float>, float>(srclayers[0]->data(this), &data_); + break; + case SIGMOID: + Map<op::Sigmoid<float>, float>(srclayers[0]->data(this), &data_); + break; + case TANH: + Map<op::Tanh<float>, float>(srclayers[0]->data(this), &data_); + break; + /* + case ActivationType_STANH: + Map<op::STanh<float>, float>(srclayers[0]->data(this), &data_); + break; + */ + default: + LOG(ERROR) << "Unknow activation type " << + layer_conf_.activation_conf().type(); + } +} +void ActivationLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) +{ + Blob<float> * gsrc = srclayers[0]->mutable_grad(this); + switch (layer_conf_.activation_conf().type()) { + case RELU: + Map<op::Relu<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + case SIGMOID: + Map<op::SigmoidGrad<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + case TANH: + Map<op::Tanh<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + /* + 
case ActivationType_STANH: + Map<op::STanh<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + */ + default: + LOG(ERROR) << "Unknow activation type " << + layer_conf_.activation_conf().type(); + } +} + +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/neuron_layer/dropout.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/dropout.cc b/src/neuralnet/neuron_layer/dropout.cc index ad8c10f..030a663 100644 --- a/src/neuralnet/neuron_layer/dropout.cc +++ b/src/neuralnet/neuron_layer/dropout.cc @@ -48,8 +48,8 @@ void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { float pkeep = 1 - pdrop_; Blob<float> rand(data_.count()); SampleUniform(0.0f, 1.0f, &rand); - // Threashold(pkeep, rand, &mask_); - // Scale(1.0f / pkeep, &mask_); + Map<op::Threshold<float>, float>(pkeep, rand, &mask_); + Scale(1.0f / pkeep, &mask_); Mult(srclayers[0]->data(this), mask_, &data_); }
