Repository: incubator-singa Updated Branches: refs/heads/master 1981874fc -> 8ac511c70
SINGA-100 Implement layers using CUDNN for GPU training Update dropout layer to run for both cudnn and cpu training mode. Update the cpu part of math_blob and math_addr to use Context class for getting cpu random generators. TODO update math_blob and math_addr for GPU code, e.g., sampling. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5d35ef26 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5d35ef26 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5d35ef26 Branch: refs/heads/master Commit: 5d35ef267326be519bc0a7ea7d2847a6c5056181 Parents: 49293a6 Author: Wei Wang <[email protected]> Authored: Fri Nov 27 15:27:31 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Fri Dec 11 11:01:29 2015 +0800 ---------------------------------------------------------------------- include/singa/utils/math_addr.h | 50 ++- include/singa/utils/math_blob.h | 408 ++++++++++++--------- include/singa/utils/singa_op.h | 28 ++ src/neuralnet/layer.cc | 7 +- src/neuralnet/loss_layer/cudnn_softmaxloss.cu | 52 +++ src/neuralnet/neuron_layer/activation.cc | 83 +++++ src/neuralnet/neuron_layer/dropout.cc | 4 +- 7 files changed, 445 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h index b21ea45..f63ff78 100644 --- a/include/singa/utils/math_addr.h +++ b/include/singa/utils/math_addr.h @@ -64,11 +64,21 @@ void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, } template<typename Dtype> -void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { +void cpu_axpy(const int n, const Dtype alpha, const Dtype * A, Dtype * B) { 
    cblas_saxpy(n, alpha, A, 1, B, 1);
 }
 
 template<typename Dtype>
+void cpu_scale(const int n, const Dtype alpha, Dtype * A) {
+  cblas_sscal(n, alpha, A, 1);
+}
+
+template<typename Dtype>
+void cpu_copy(const int n, const Dtype* A, Dtype *B) {
+  cblas_scopy(n, A, 1, B, 1);
+}
+
+template<typename Dtype>
 Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
   Dtype sum = 0;
   for (int i = 0 ; i < n ; i++)
@@ -122,22 +132,42 @@ void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
     Op::Map(A[i], n, B+i*n);
   }
 }
-// expand each element in A into a row of B
+
 template<typename Dtype>
-void cpu_sample_uniform(int n, Dtype low, Dtype high, Dtype* A);
+// Row-wise softmax of the nb_rows x nb_cols matrix A into B.
+// NOTE(review): reads from A and writes into B; the originally submitted
+// version wrote through the const input pointer (dptr[x] = ...), which fails
+// to compile and would have left the output B untouched.
+void cpu_softmax(int nb_rows, int nb_cols, const Dtype* A, Dtype* B) {
+  for (int i = 0; i < nb_rows; i++) {
+    const Dtype* aptr = A + i * nb_cols;
+    Dtype* bptr = B + i * nb_cols;
+    Dtype mmax = aptr[0];  // subtract the row max for numerical stability
+    for (int x = 1; x < nb_cols; ++x )
+      if (mmax < aptr[x]) mmax = aptr[x];
+    Dtype sum = 0.0f;
+    for (int x = 0; x < nb_cols; ++x ) {
+      bptr[x] = std::exp(aptr[x] - mmax );
+      sum += bptr[x];
+    }
+    for (int x = 0; x < nb_cols; ++x ) {
+      bptr[x] /= sum;
+    }
+  }
+}

-template<>
-inline void cpu_sample_uniform<float>(int n, float low, float high, float* A) {
-}
-template<typename Dtype>
-void cpu_sample_gaussian(int n, Dtype mean, Dtype std, Dtype* A);
-template<>
-inline void cpu_sample_gaussian<float>(int n, float mean, float std, float* A) {
+template<typename Dtype, typename URNG>
+void cpu_sample_uniform(URNG& g, int n, Dtype low, Dtype high, Dtype* A) {
+  std::uniform_real_distribution<Dtype> distribution(low, high);
+  for (int i = 0; i < n; i++)
+    A[i] = distribution(g);
+}
+template<typename Dtype, typename URNG>
+void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {
+  std::normal_distribution<Dtype> distribution(mean, std);
+  for (int i = 0; i < n; i++)
+    A[i] = distribution(g);
 }
+
 #ifdef USE_GPU
 template<typename Dtype>
 void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h index ca75205..ce40d4f 100644 --- a/include/singa/utils/math_blob.h +++ b/include/singa/utils/math_blob.h @@ -42,30 +42,34 @@ enum XPU {cpu, gpu, any}; * Use blas scale internally. */ template<typename Dtype> -void Scale(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count(), B->count()); - if (xpu == cpu) - cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); +void Scale(Dtype alpha, Blob<Dtype> * B) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) + cpu_scale(B->count(), alpha, B->mutable_cpu_data()); + else { #ifdef USE_GPU + // TODO(haibo) check it. +// gpu_scale(B->count(), alpha, B->mutable_gpu_data()); #endif + } } /** * Element-wise operation: Bi = alpha*Ai+Bi. 
A and B should have the same size */ template<typename Dtype> -void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { +void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count(), B->count()); - if (xpu == cpu) { - cpu_axpy(A.cpu_data(), A.count(), - alpha, B->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { - gpu_axpy(A.gpu_data(), A.count(), - alpha, B->mutable_gpu_data()); - } + gpu_axpy(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data()); #endif // USE_GPU + } } /************* BLAS level 2 *****************/ @@ -83,7 +87,7 @@ void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { * @param[in, out] C, vector */ template<typename Dtype> -void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, +void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { CHECK_EQ(A.shape().size(), 2) << "A must be a matrix"; int a1, a2, m, n; @@ -95,17 +99,18 @@ void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C"; bool TranA = A.transpose(); - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA, C->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /** * Matrix vector multiplication, C = A(.T) * B, transpose is considered. 
@@ -119,9 +124,9 @@ void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, * @param[out] C output vector */ template <typename Dtype> -void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, +void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { - GEMV(xpu, Dtype(1), Dtype(0), A, B, C); + GEMV(Dtype(1), Dtype(0), A, B, C); } /************* BLAS level 3 *****************/ @@ -140,7 +145,7 @@ void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * @param[in, out] C, matrix */ template <typename Dtype> -void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, +void GEMM( Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype> & B, Blob<Dtype> * C) { CHECK_EQ(A.shape().size(), 2); CHECK_EQ(B.shape().size(), 2); @@ -160,17 +165,18 @@ void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, int k = A.transpose() ? A.shape(0) : A.shape(1); bool TranA = A.transpose(); bool TranB = B.transpose(); - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, - TranA, TranB, C->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, + C->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /** * Matrix multiplication, C = A(.T) * B(.T), transpose is considered. 
@@ -183,9 +189,9 @@ void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, * @param[out] C output matrix */ template <typename Dtype> -void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, +void MMDot(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) { - GEMM(xpu, Dtype(1), Dtype(0), A, B, C); + GEMM(Dtype(1), Dtype(0), A, B, C); } @@ -199,19 +205,20 @@ void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * @return inner product value. */ template <typename Dtype> -Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { +Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) { Dtype res = 0; CHECK_EQ(A.count(), B.count()); int n = A.count(); - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { res = cpu_dot(A.cpu_data(), B.cpu_data(), n); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part res = gpu_dot(A.gpu_data(), B.gpu_data(), n); - } #endif // USE_GPU + } return res; } @@ -224,25 +231,24 @@ Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { * @param[out] C, output matrix */ template <typename Dtype> -void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, - Blob<Dtype> * C) { +void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) { CHECK(!C->transpose()); // do not support C.T now. 
int m = A.count(); int n = B.count(); CHECK_EQ(C->count(), m * n); - - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, - false, false, C->mutable_cpu_data()); - } + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, + C->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { // gpu part gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data()); - } #endif // USE_GPU + } } /*********************** Element-wise functions ***********************/ /** @@ -251,17 +257,18 @@ void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, * Loose shape checking, A.count() == B.count(). */ template<typename Op, typename Dtype> -void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { +void Map(const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data()); - } + } else { #ifdef SINGA_GPU - if (xpu == gpu) { // gpu part gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); - } #endif // SINGA_GPU + } } /** @@ -270,19 +277,20 @@ void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * Loose shape checking, A, B and C are of the same size. 
 */
template<typename Op, typename Dtype>
-void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
+void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
   CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
   CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
-  if (xpu == gpu) {  // gpu part
     gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-  }
 #endif  // SINGA_GPU
+  }
 }
(NOTE(review): the submitted hunk also added an unconditional
`cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());`
line before the device check, duplicating the CPU-branch call — the CPU kernel
would run twice on CPU and would also run when a GPU device is selected. That
stray added line is removed here.)

 /**
@@ -290,28 +298,34 @@ void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
  * Loose shape checking, A.count() == B.count().
  */
 template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
+void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
   CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
 #endif  // SINGA_GPU
+  }
 }

 /**
  * Ci = Op(alpha, Ai, Bi)
  * Loose shape checking, A, B and C are of the same size.
 */
template<typename Op, typename Dtype>
-void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
+void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
     Blob<Dtype>* C) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
+  // NOTE(review): B is a const reference, not a pointer — `B->count()` /
+  // `B->cpu_data()` do not compile; also check C's size before writing it.
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B.cpu_data(),
         C->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
 #endif  // SINGA_GPU
+  }
 }

 /**
@@ -322,24 +336,46 @@ void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
  * Loose shape checking, A.count() == B.count().
  */
 template<typename Dtype>
-void Copy(XPU xpu, const Blob<Dtype>& A, Blob<Dtype>* B) {
+void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
   CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
   } else {
-    LOG(FATAL) << "Not implemented";
+#ifdef USE_GPU
+    // TODO(review): add gpu copy
+#else
+    // keep failing loudly instead of silently doing nothing when a GPU
+    // device is selected but the build has no GPU support
+    LOG(FATAL) << "Not implemented";
+#endif
   }
 }
+
+/**
+ * B = alpha + A
+ * Implemented using Map with op::Add.
+ */
+template<typename Dtype>
+void Add(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Add<Dtype>>(alpha, A, B);
+}
+
 /**
  * C = A + B
  * Implemented using Copy and AXPY.
  */
 template<typename Dtype>
-void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
     Blob<Dtype> * C) {
   Copy(A, C);
-  AXPY(B, C, 1);
+  AXPY(Dtype(1), B, C);
+}
+
+/**
+ * B = alpha - A
+ * Implemented using Map with op::Sub.
+ */ +template<typename Dtype> +void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Sub<Dtype>>(alpha, A, B); } /** @@ -347,10 +383,10 @@ void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, * Implemented using Copy and AXPY. */ template<typename Dtype> -void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, +void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - Copy(xpu, A, C); - AXPY(xpu, B, C, -1); + Copy(A, C); + AXPY(Dtype(-1), B, C); } /** @@ -360,19 +396,8 @@ void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, template<typename Dtype> void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - //Map<singa::op::Mult<Dtype>>(xpu, A, B, C); + Map<singa::op::Mult<Dtype>>(A, B, C); // TODO(wangwei) use MKL's vector func - auto context = Singleton<Context>::Instance(); - int device = context->device_id(std::this_thread::get_id()); - if (device == -1) - cpu_e_f<op::Mult<Dtype>>(C->count(), A.cpu_data(), B.cpu_data(), - C->mutable_cpu_data()); - else { -#ifdef USE_GPU - gpu_e_f<op::Mult<Dtype>>(C->count(), A.gpu_data(), B.gpu_data(), - C->mutable_gpu_data()); -#endif - } } /** @@ -380,11 +405,46 @@ void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B, * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). 
*/ template<typename Dtype> -void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, +void Div(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) { - Map<singa::op::Div<Dtype>>(xpu, A, B, C); + Map<singa::op::Div<Dtype>>(A, B, C); // TODO(wangwei) use MKL's vector func } +/** + * B = sqrt(A) + */ +template<typename Dtype> +void Sqrt(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Sqrt<Dtype>, Dtype>(A, B); +} +/** + * B = square(A) + */ +template<typename Dtype> +void Square(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Square<Dtype>, Dtype>(A, B); +} +/** + * B = exp(A) + */ +template<typename Dtype> +void Exp(const Blob<Dtype> & A, Blob<Dtype>* B) { + Map<singa::op::Exp<Dtype>, Dtype>(A, B); +} +/** + * B = log(A) + */ +template<typename Dtype> +void Log(const Blob<Dtype>& A, Blob<Dtype>* B) { + Map<singa::op::Log<Dtype>, Dtype>(A, B); +} +/** + * B = tanh(A) + */ +template<typename Dtype> +void Tanh(const Blob<Dtype>& A, Blob<Dtype>* B) { + Map<singa::op::Tanh<Dtype>, Dtype>(A, B); +} /*************************1D<-->2D op/transform***************************/ /** * Add A to each column of B, i.e., Bij = alpha*Ai + beta*Bij @@ -392,28 +452,26 @@ void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, * # columns of B = B.count() / A.count(). 
*/ template<typename Dtype> -void MVAddCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { if (B->transpose()) { - Blob<Dtype>* tmp = Transpose(* B); - MVAddRow(xpu, alpha, beta, A, tmp); - delete tmp; + B->set_transpose(false); + MVAddRow(alpha, beta, A, B); + B->set_transpose(true); } else { CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A"; int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, - false, false, B->mutable_cpu_data()); - } + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, false, false, + B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { - singa_gpu_add_vec_row(B->gpu_data(), - A.gpu_data(), A.gpu_data(), m, n, n); - // gpu part - } + singa_gpu_add_vec_row(B->gpu_data(), A.gpu_data(), A.gpu_data(), m, n, n); #endif // USE_GPU + } } } /** @@ -422,8 +480,8 @@ void MVAddCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # columns of B = B.count() / A.count(). */ template<typename Dtype> -void MVAddCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { - MVAddCol(xpu, Dtype(1), Dtype(1), A, B); +void MVAddCol(const Blob<Dtype> & A, Blob<Dtype>* B) { + MVAddCol(Dtype(1), Dtype(1), A, B); } /** @@ -432,28 +490,26 @@ void MVAddCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { * # rows of B = B.count() / A.count(). 
*/ template<typename Dtype> -void MVAddRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { if (B->transpose()) { - Blob<Dtype>* tmp = Transpose(* B); - MVAddCol(xpu, alpha, beta, A, tmp); - delete tmp; + B->set_transpose(false); + MVAddCol(alpha, beta, A, B); + B->set_transpose(true); } else { CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A"; int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemm(one.cpu_data(), A.cpu_data(), n, m, 1, alpha, beta, false, false, B->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { - // gpu part - singa_gpu_add_vec_row(B->gpu_data(), - A.gpu_data(), A.gpu_data(), m, n, n); - } + singa_gpu_add_vec_row(B->gpu_data(), A.gpu_data(), A.gpu_data(), m, n, n); #endif // USE_GPU + } } } /** @@ -462,8 +518,8 @@ void MVAddRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # rows of B = B.count() / A.count(). */ template<typename Dtype> -void MVAddRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { - MVAddRow(xpu, Dtype(1), Dtype(1), A, B); +void MVAddRow(const Blob<Dtype> & A, Blob<Dtype>* B) { + MVAddRow(Dtype(1), Dtype(1), A, B); } /** @@ -472,8 +528,8 @@ void MVAddRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { * # columns of B = B.count() / A.count(). */ template<typename Dtype> -void RepmatCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - MVAddCol(xpu, Dtype(1), Dtype(0), A, B); +void RepmatCol(const Blob<Dtype> & A, Blob<Dtype> * B) { + MVAddCol(Dtype(1), Dtype(0), A, B); } /** @@ -482,8 +538,8 @@ void RepmatCol(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # rows of B = B.count() / A.count(). 
*/ template<typename Dtype> -void RepmatRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - MVAddRow(xpu, Dtype(1), Dtype(0), A, B); +void RepmatRow(const Blob<Dtype> & A, Blob<Dtype> * B) { + MVAddRow(Dtype(1), Dtype(0), A, B); } /** @@ -493,18 +549,18 @@ void RepmatRow(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # columns of A = A.count() / B.count(). */ template<typename Dtype> -void MVSumCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta, A.transpose(), false, B->mutable_cpu_data()); - } + } else { #ifdef USE_GPU - if (xpu == gpu) { singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); // gpu part (TODO check transpose case) } @@ -518,18 +574,18 @@ void MVSumCol(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # rows of A = A.count() / B.count(). 
*/ template<typename Dtype> -void MVSumRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { +void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta, - A.transpose(), false, B->mutable_cpu_data()); - } + Blob<Dtype> one(n); + one.SetValue(1); + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta, A.transpose(), + false, B->mutable_cpu_data()); + } else { #ifdef USE_GPU - if (xpu == gpu) { singa_gpu_sum_row(A.gpu_data(), B->gpu_data(), m, n, n); // gpu part (TODO check transpose case) } @@ -542,18 +598,19 @@ void MVSumRow(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, * # columns of A = A.count() / B.count(). */ template<typename Op, typename Dtype> -void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { +void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) { CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length"; int m = B->count(), n = A.count() / m; - if (xpu == cpu) { + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } + } else { #ifdef SINGA_GPU - if (xpu == gpu) { // gpu part gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } #endif // SINGA_GPU + } } /** * Duplicate each element of A into a row of B. @@ -561,62 +618,67 @@ void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { * # columns of B = B.count() / A.count(). 
 */
template<typename Op, typename Dtype>
-void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
   CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
   int m = A.count(), n = B->count() / m;
-  if (xpu == cpu) {
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  }
+  } else {
 #ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
     gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-  }
 #endif  // SINGA_GPU
+  }
 }

 /**
  * Average the absolute values.
  */
-template <typename Dtype>
-Dtype Asum(XPU xpu, const Blob<Dtype>& A) {
+template<typename Dtype>
+Dtype Asum(const Blob<Dtype>& A) {
   if (A.count() == 0) return Dtype(0);
-  if (xpu == cpu)
+  auto context = Singleton<Context>::Instance();
+  int device = context->device_id(std::this_thread::get_id());
+  if (device == -1) {
     return cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
-  return Dtype(0);  // avoid compile warning
+  }
+#ifdef USE_GPU
+  // TODO(haibo) gpu asum
+#endif
+  // NOTE(review): the submitted version put `return 0;` only inside the
+  // #ifdef USE_GPU branch, so a CPU-only build could fall off the end of a
+  // non-void function (undefined behavior). Always return a value here.
+  return Dtype(0);
 }
 
 /*************Random Sample***************/
 template<typename Dtype>
-void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A);
-
-template<>
-inline void SampleUniform<float>(float low, float high, Blob<float> *A) {
+void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
   auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device == -1)
-    cpu_sample_uniform(A->count(), low, high, A->mutable_cpu_data());
-  else {
+  const auto& thread = std::this_thread::get_id();
+  int device = context->device_id(thread);
+  if (device == -1) {
+    cpu_sample_uniform(*context->rand_generator(thread), A->count(), low, high,
+                       A->mutable_cpu_data());
+  } else {
 #ifdef USE_GPU
+    // TODO(haibo) check
     gpu_sample_uniform(A->count(), low, high,
A->mutable_gpu_data()); #endif } } template<typename Dtype> -void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A); - -template<> -inline void SampleGaussian<float>(float low, float high, Blob<float> *A) { +void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) { auto context = Singleton<Context>::Instance(); - int device = context->device_id(std::this_thread::get_id()); - if (device == -1) - cpu_sample_gaussian(A->count(), low, high, A->mutable_cpu_data()); - else { + const auto& thread = std::this_thread::get_id(); + int device = context->device_id(thread); + if (device == -1) { + cpu_sample_gaussian(*context->rand_generator(thread), A->count(), mean, std, + A->mutable_cpu_data()); + } else { #ifdef USE_GPU - gpu_sample_gaussian(A->count(), low, high, A->mutable_gpu_data()); + // TODO(haibo) check it. + gpu_sample_gaussian(A->count(), mean, std, A->mutable_gpu_data()); #endif } } @@ -627,11 +689,15 @@ void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) { CHECK_GT(nb_rows, 0); CHECK_EQ(A.count() % nb_rows, 0); CHECK_EQ(A.count(), B->count()); - -#ifdef USE_GPU - cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(), + auto context = Singleton<Context>::Instance(); + int device = context->device_id(std::this_thread::get_id()); + if (device == -1) { + cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(), B->mutable_cpu_data()); + } else { +#ifdef USE_GPU #endif // USE_GPU + } } } // end of namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/include/singa/utils/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h index c934050..7499eb1 100644 --- a/include/singa/utils/singa_op.h +++ b/include/singa/utils/singa_op.h @@ -211,6 +211,34 @@ struct Pow { } #endif // USE_GPU }; + +template<typename Dtype> +struct Add { + inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { + *c = a + b; + } 
+#ifdef USE_GPU + inline static void CudaMap(const Dtype * a, + const Dtype * b, Dtype * c, int n) { +// singa::singa_gpu_add(a, b, c, n); // TODO(haibo) + } +#endif // USE_GPU +}; + +template<typename Dtype> +struct Sub { + inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { + *c = a - b; + } +#ifdef USE_GPU + inline static void CudaMap(const Dtype * a, + const Dtype * b, Dtype * c, int n) { +// singa::singa_gpu_add(a, b, c, n); // TODO(haibo) + } +#endif // USE_GPU +}; + + template<typename Dtype> struct Mult { inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/layer.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc index be38ac5..9414948 100644 --- a/src/neuralnet/layer.cc +++ b/src/neuralnet/layer.cc @@ -48,17 +48,16 @@ const std::string Layer::ToString(bool debug, int flag) { return ""; string ret = StringPrintf("Layer %10s ", name().c_str()); if ((flag & kForward) == kForward && data_.count() !=0) { - ret += StringPrintf("data norm1 %13.9f", Asum(cpu, data_)); + ret += StringPrintf("data norm1 %13.9f", Asum(data_)); } else if ((flag & kBackward) == kBackward) { if (grad_.count() != 0) - ret += StringPrintf("grad norm1 %13.9f\n", Asum(cpu, grad_)); + ret += StringPrintf("grad norm1 %13.9f\n", Asum(grad_)); } if ((flag & kTrain) == kTrain) { for (Param* p : GetParams()) { ret += StringPrintf( "param id %2d, name %10s, value norm1 %13.9f, grad norm1 %13.9f\n", - p->id(), p->name().c_str(), Asum(cpu, p->data()), - Asum(cpu, p->grad())); + p->id(), p->name().c_str(), Asum(p->data()), Asum(p->grad())); } } return ret; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/loss_layer/cudnn_softmaxloss.cu ---------------------------------------------------------------------- diff --git a/src/neuralnet/loss_layer/cudnn_softmaxloss.cu 
b/src/neuralnet/loss_layer/cudnn_softmaxloss.cu new file mode 100644 index 0000000..e0af05f --- /dev/null +++ b/src/neuralnet/loss_layer/cudnn_softmaxloss.cu @@ -0,0 +1,52 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + +#include "singa/neuralnet/loss_layer.h" + +namespace singa { +void CudnnSoftmaxLossLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + CudnnSoftmaxLayer::Setup(conf, srclayers); + topk_ = conf.softmaxloss_conf().topk(); + loss_ = accuracy_ = 0.0f; + counter_ = 0; +} +void CudnnSoftmaxLossLayer::ComputeFeature(int flag, + const vector<Layer*>& srclayers) { + CudnnSoftmaxLayer::ComputeFeature(flag, srclayers); + // compute loss + counter_++; + // add loss and accuracy +} + +void CudnnSoftmaxLossLayer::ComputeGradient(int flag, + const vector<Layer*>& srclayers) { + // compute gradient +} + +const std::string CudnnSoftmaxLossLayer::ToString(bool debug, int flag) { + string disp = "Loss = " + std::to_string(loss_ / counter_) + + ", accuracy = " + std::to_string(accuracy_ / counter_); + counter_ = 0; + loss_ = accuracy_ = 0; + return disp; +} +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/neuron_layer/activation.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/activation.cc b/src/neuralnet/neuron_layer/activation.cc new file mode 100644 index 0000000..6f62646 --- /dev/null +++ b/src/neuralnet/neuron_layer/activation.cc @@ -0,0 +1,83 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#include "singa/neuralnet/neuron_layer.h" +#include "singa/utils/math_blob.h" +#include "singa/utils/singa_op.h" +#include "singa/proto/job.pb.h" +namespace singa { + +void ActivationLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + NeuronLayer::Setup(conf, srclayers); + data_.ReshapeLike(srclayers[0]->data(this)); + grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this))); +} +void ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) +{ + switch (layer_conf_.activation_conf().type()) { + case RELU: + Map<op::Relu<float>, float>(srclayers[0]->data(this), &data_); + break; + case SIGMOID: + Map<op::Sigmoid<float>, float>(srclayers[0]->data(this), &data_); + break; + case TANH: + Map<op::Tanh<float>, float>(srclayers[0]->data(this), &data_); + break; + /* + case ActivationType_STANH: + Map<op::STanh<float>, float>(srclayers[0]->data(this), &data_); + break; + */ + default: + LOG(ERROR) << "Unknow activation type " << + layer_conf_.activation_conf().type(); + } +} +void ActivationLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) +{ + Blob<float> * gsrc = srclayers[0]->mutable_grad(this); + switch (layer_conf_.activation_conf().type()) { + case RELU: + Map<op::Relu<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + case SIGMOID: + Map<op::SigmoidGrad<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + case TANH: + Map<op::Tanh<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + /* + 
case ActivationType_STANH: + Map<op::STanh<float>, float>(data_, gsrc); + Mult(*gsrc, grad_, gsrc); + break; + */ + default: + LOG(ERROR) << "Unknow activation type " << + layer_conf_.activation_conf().type(); + } +} + +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5d35ef26/src/neuralnet/neuron_layer/dropout.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/dropout.cc b/src/neuralnet/neuron_layer/dropout.cc index ad8c10f..030a663 100644 --- a/src/neuralnet/neuron_layer/dropout.cc +++ b/src/neuralnet/neuron_layer/dropout.cc @@ -48,8 +48,8 @@ void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { float pkeep = 1 - pdrop_; Blob<float> rand(data_.count()); SampleUniform(0.0f, 1.0f, &rand); - // Threashold(pkeep, rand, &mask_); - // Scale(1.0f / pkeep, &mask_); + Map<op::Threshold<float>, float>(pkeep, rand, &mask_); + Scale(1.0f / pkeep, &mask_); Mult(srclayers[0]->data(this), mask_, &data_); }
