SINGA-80 New Blob Level and Address Level Math Operation Interface

Temp commit, not compiled yet.

* Move functions in math_addr.cc and math_blob.cc into header files to
  simplify the compilation of template code.
* Add comments in math_blob.h.
* Add shape checking.
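A note on the first bullet: a template definition must be visible at every point of instantiation, so the bodies have to live in headers unless every Dtype is explicitly instantiated in the .cc file. A minimal illustration of the pattern, using a hypothetical helper that is not part of this patch:

    // A template defined in a header: every caller can instantiate it
    // for its own Dtype. (Hypothetical helper, not part of the patch.)
    template<typename Dtype>
    Dtype cpu_sum(const Dtype* A, int n) {
      Dtype s = 0;
      for (int i = 0; i < n; i++)
        s += A[i];
      return s;
    }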
TODO:
* Remove functions like relu/softplus/sigmoid for Blob; each body contains
  only one line of code, which can be written directly at the call site of
  the underlying function.
* Update the Blob class to implement helper functions, e.g., Reshape and
  shape(int k).
* Update the math functions for GPU; some APIs do not match.

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/4b84dbe3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/4b84dbe3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/4b84dbe3

Branch: refs/heads/master
Commit: 4b84dbe30296985afafc88c08dc84f664cfc3617
Parents: 641eb31
Author: Wei Wang <[email protected]>
Authored: Mon Nov 9 14:10:40 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 include/singa/blob/math_addr.h | 201 +++++---
 include/singa/blob/math_blob.h | 955 +++++++++++++++++++++---------------
 include/singa/blob/singa_op.h  | 500 +++++++++----------
 include/singa/utils/blob.h     | 188 +++++--
 src/blob/math_addr.cc          | 120 -----
 src/blob/math_blob.cc          | 214 --------
 6 files changed, 1092 insertions(+), 1086 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
index 2a25a29..25cef07 100644
--- a/include/singa/blob/math_addr.h
+++ b/include/singa/blob/math_addr.h
@@ -21,113 +21,186 @@
 #ifndef SINGA_BLOB_MATH_ADDR_H_
 #define SINGA_BLOB_MATH_ADDR_H_
+extern "C" {
+  #include <cblas.h>
+}
+#ifdef USE_GPU
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+#endif
+#include "singa/blob/singa_op.h"

-namespace singa {
-const float * cpu_uni_vec(const int n);
+namespace singa {

-void cpu_gemm(const float * A, const float * B,
-const int m, const int n, const int k, const float alpha, const float beta,
-const bool TranA, const bool TranB, float * C);
+template<typename Dtype>
+void cpu_gemm(const Dtype * A, const Dtype * B,
+    const int m, const int n, const int k, const Dtype alpha,
+    const Dtype beta, const bool TranA, const bool TranB, Dtype * C) {
+  int lda = TranA ? m : k;
+  int ldb = TranB ? k : n;
+  CBLAS_TRANSPOSE tA = TranA ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE tB = TranB ? CblasTrans : CblasNoTrans;
+  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
+      B, ldb, beta, C, n);
+}
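For reference, a usage sketch of the row-major cpu_gemm wrapper above; the values are illustrative only:

    // C(2x2) = 1.0 * A(2x3) * B(3x2) + 0.0 * C, all row-major.
    float A[6] = {1, 2, 3, 4, 5, 6};
    float B[6] = {1, 0, 0, 1, 1, 1};
    float C[4] = {0, 0, 0, 0};
    singa::cpu_gemm(A, B, 2, 2, 3, 1.0f, 0.0f, false, false, C);
    // C now holds {4, 5, 10, 11}.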
-void cpu_gemv(const float * A, const float * B, const int m, const int n,
-const float alpha, const float beta, const bool TranA, float * C);

// should be very careful:
// m is the length of B, and n is the length of C; the stored matrix A
// is n*m when TranA == false and m*n when TranA == true
+template<typename Dtype>
+void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+  if (TranA)
+    cblas_sgemv(CblasRowMajor, CblasTrans, m, n, alpha, A, n, B, 1,
+        beta, C, 1);
+  else
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, n, m, alpha, A, m, B, 1,
+        beta, C, 1);
+}

-void cpu_axpy(const float * A, const int n, const float alpha, float * B);
+template<typename Dtype>
+void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+  cblas_saxpy(n, alpha, A, 1, B, 1);
+}

-float cpu_dot(const float * A, const float * B, const int n);
+template<typename Dtype>
+Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) {
+  Dtype sum = 0;
+  for (int i = 0; i < n; i++)
+    sum += A[i] * B[i];
+  return sum;
+}

// element-wise
-template<typename Op>
-void cpu_e_f(const int n, const float alpha, float * A) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, &A[i]);
-  }
-}
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(alpha, &A[i]);
+  }
+}

+// in-place overload matching Op::Map(&a)
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, Dtype * A) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(&A[i]);
+  }
+}

+// overload matching Op::Map(a, &b), used by the Blob-level Map(xpu, A, B)
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(A[i], &B[i]);
+  }
+}

-template<typename Op>
-void cpu_e_f(const int n, const float * A, const float alpha, float * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], &B[i]);
-  }
-}
+// note: alpha now precedes the data pointers, matching the Blob-level callers
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(alpha, A[i], &B[i]);
+  }
+}

+// overload matching Op::Map(a, b, &c), e.g. op::Mult and op::Div
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(A[i], B[i], &C[i]);
+  }
+}

+// overload matching Op::Map(alpha, a, b, &c)
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype alpha, const Dtype * A,
+    const Dtype * B, Dtype * C) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(alpha, A[i], B[i], &C[i]);
+  }
+}

-template<typename Op>
-void cpu_e_f(const int n, const float * A, const float * B,
-const float alpha, const float beta, float * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, beta, A[i], B[i], &C[i]);
-  }
-}
+template<typename Op, typename Dtype>
+void cpu_e_f(const int n, const Dtype * A, const Dtype * B,
+    const Dtype alpha, const Dtype beta, Dtype * C) {
+  for (int i = 0; i < n; i++) {
+    Op::Map(alpha, beta, A[i], B[i], &C[i]);
+  }
+}
// element-wise generalized operation defined in Op

// matrix/vector expand/reduce
-template<typename Op>
-void cpu_reduce_f(const float * A, const int m, const int n, float * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A+i*n, n, B[i]);
-  }
-}
+template<typename Op, typename Dtype>
+void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0; i < m; i++) {
+    Op::Map(A + i * n, n, &B[i]);  // pass the address; Op::Map takes Dtype*
+  }
+}
// reduce each row of A to an element of B, e.g. the sum operation in softmax
-template<typename Op>
-void cpu_expand_f(const float * A, const int m, const int n, float * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A[i], n, B+i*n);
-  }
-}
+template<typename Op, typename Dtype>
+void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0; i < m; i++) {
+    Op::Map(A[i], n, B + i * n);
+  }
+}
// expand each element in A into a row of B
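A sketch of the reduce helper above at the raw-pointer level, using op::Sum from singa_op.h; illustrative only:

    // Row sums of a 2x3 matrix with op::Sum (defined in singa_op.h).
    float M[6] = {1, 2, 3, 4, 5, 6};
    float rows[2];
    singa::cpu_reduce_f<singa::op::Sum>(M, 2, 3, rows);  // rows = {6, 15}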
-#ifdef SINGA_GPU
-void gpu_gemm(const float * A, const float * B,
-const int m, const int n, const int k, const float alpha, const float beta,
-const bool TranA, const bool TranB, float * C);
+#ifdef USE_GPU
+// Trick: swap A and B, since cuBLAS assumes column-major storage
+template<typename Dtype>
+void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n,
+    const int k, const Dtype alpha, const Dtype beta, const bool TranA,
+    const bool TranB, Dtype * C) {
+  int lda = TranA ? m : k;
+  int ldb = TranB ? k : n;
+  int ldc = n;
+  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
+      A, lda, &beta, C, ldc);
+  cublasDestroy(handle);
+}

-void gpu_gemv(const float * A, const float * B, const int m, const int n,
-const float alpha, const float beta, const bool TranA, float * C);
+template<typename Dtype>
+void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
+    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
+  int lda = n;
+  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSgemv(handle, tA, n, m, &alpha, A, lda, B, 1, &beta, C, 1);
+  cublasDestroy(handle);
+}

-void gpu_axpy(const float * A, const int n, const float alpha, float * B);
+template<typename Dtype>
+void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) {
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
+  cublasDestroy(handle);
+}

-float gpu_dot(const float * A, const float * B, const int n);
+template<typename Dtype>
+Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) {
+  cublasHandle_t handle;
+  cublasCreate(&handle);
+  Dtype result = 0.0;
+  cublasSdot(handle, n, A, 1, B, 1, &result);
+  cublasDestroy(handle);
+  return result;
+}

// element-wise
-template<typename Op>
-void gpu_e_f(const int n, const float alpha, float * A) {
-  Op::CudaMap(alpha, A, n);
-}
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
+  Op::CudaMap(alpha, A, n);
+}

+// overload matching Op::CudaMap(a, b, n), used by the Blob-level Map
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  Op::CudaMap(A, B, n);
+}

-template<typename Op>
-void gpu_e_f(const int n, const float * A, const float alpha, float * B) {
-  Op::CudaMap(alpha, A, B, n);
-}
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
+  Op::CudaMap(alpha, A, B, n);
+}

+// overload matching Op::CudaMap(a, b, c, n), e.g. op::Mult and op::Div
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
+  Op::CudaMap(A, B, C, n);
+}

-template<typename Op>
-void gpu_e_f(const int n, const float * A, const float * B,
-const float alpha, const float beta, float * C) {
-  Op::CudaMap(alpha, beta, A, B, C, n);
-}
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B,
+    const Dtype alpha, const Dtype beta, Dtype * C) {
+  Op::CudaMap(alpha, beta, A, B, C, n);
+}
// element-wise generalized operation defined in Op

// matrix/vector expand/reduce
-template<typename Op>
-void gpu_reduce_f(const float * A, const int m, const int n, float * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A+i*n, n, B[i]);
-  }
-}
+template<typename Op, typename Dtype>
+void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0; i < m; i++) {
+    Op::CudaMap(A + i * n, n, &B[i]);  // pass the address, as on the cpu path
+  }
+}
// reduce each row of A to an element of B, e.g. the sum operation in softmax
-template<typename Op>
-void gpu_expand_f(const float * A, const int m, const int n, float * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A[i], n, B+i*n);
-  }
-}
+template<typename Op, typename Dtype>
+void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
+  for (int i = 0; i < m; i++) {
+    Op::CudaMap(A[i], n, B + i * n);
+  }
+}
// expand each element in A into a row of B
-#endif  // SINGA_GPU
+#endif  // USE_GPU

 }  // namespace singa

 #endif  // SINGA_BLOB_MATH_ADDR_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/include/singa/blob/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h
index 638f9cc..f3147e8 100644
--- a/include/singa/blob/math_blob.h
+++ b/include/singa/blob/math_blob.h
@@ -29,420 +29,569 @@
 namespace singa {
-/*********************Level-2 interface, called by user code*******************/
-
-int get_size(const std::vector<int>& shape);
-
-template <typename Dtype>
-bool check_shape_mv(const Blob<Dtype> & A, const Blob<Dtype> & B) {
-  if (A.shape().size() != 2) return false;
-  if (B.shape().size() != 1) return false;
-  if (A.shape().at(0) != B.shape().at(0)) return false;
-  return true;
-}
-
-template <typename Dtype>
-bool check_shape_equal(const Blob<Dtype> & A, const Blob<Dtype> & B,
-const Blob<Dtype> & C) {
-  int asize, bsize, csize;
-  asize = get_size(A.shape());
-  bsize = get_size(B.shape());
-  csize = get_size(C.shape());
-  if (asize != bsize) return false;
-  if (asize != csize) return false;
-  return true;
-}
-
-template <typename Dtype>
-bool check_shape_mmm(const Blob<Dtype> & A, const Blob<Dtype> & B,
-const Blob<Dtype> & C) {
-  if (A.shape().size() != 2) return false;
-  if (B.shape().size() != 2) return false;
-  if (C.shape().size() != 2) return false;
-  int a1, a2, b1, b2, c1, c2;
-  if (C.isTranspose()) return false;
-  a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0);
-  a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
-  b1 = B.isTranspose() ? B.shape().at(1) : B.shape().at(0);
-  b2 = B.isTranspose() ? B.shape().at(0) : B.shape().at(1);
-  c1 = C.shape().at(0);
-  c2 = C.shape().at(1);
-  if (a2 != b1) return false;
-  if (a1 != c1) return false;
-  if (b2 != c2) return false;
-  return true;
-}
-
-template <typename Dtype>
-bool check_shape_vvm(const Blob<Dtype> & A, const Blob<Dtype> & B,
-const Blob<Dtype> & C) {
-  if (A.shape().size() != 1) return false;
-  if (B.shape().size() != 1) return false;
-  if (C.shape().size() != 2) return false;
-  int a1, b1, c1, c2;
-  if (C.isTranspose()) return false;
-  a1 = A.shape().at(0);
-  b1 = B.shape().at(0);
-  c1 = C.shape().at(0);
-  c2 = C.shape().at(1);
-  if (a1 != c2) return false;
-  if (b1 != c1) return false;
-  return true;
-}
-
+/** Computing device: cpu, gpu, or any. */
+enum XPU {cpu, gpu, any};

+/************* BLAS level 1 *****************/
+/**
+ * Scale each element of A with alpha, and put the result into B,
+ * i.e., Bi = alpha*Ai.
+ */
+template<typename Dtype>
+void Scale(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count());
+  if (xpu == cpu)
+    // implemented with the element-wise helper and op::Scale;
+    // TODO(wangwei) use blas scale internally
+    cpu_e_f<op::Scale>(A.count(), alpha, A.cpu_data(),
+        B->mutable_cpu_data());
+#ifdef USE_GPU
+  // TODO(wangwei) add the gpu version
+#endif
+}
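A usage sketch of the Blob-level API above, assuming the Blob constructors and the SetValue helper added to blob.h by this patch; illustrative only:

    singa::Blob<float> A(2, 3), B(2, 3);
    A.SetValue(1.0f);
    singa::Scale(singa::cpu, 0.5f, A, &B);  // Bi = 0.5 * Ai = 0.5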
+/**
+ * Element-wise operation: Bi = alpha*Ai + Bi.
+ * A and B must have the same size.
+ */
+template<typename Dtype>
+void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count());
+  if (xpu == cpu) {
+    cpu_axpy(A.cpu_data(), A.count(),
+        alpha, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_axpy(A.gpu_data(), A.count(),
+        alpha, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}

+/************* BLAS level 2 *****************/
+/**
+ * Matrix-vector multiplication, C = alpha A(.T) * B + beta C.
+ * Strict shape checking:
+ * - dim of A == 2
+ * - columns of A(.T) == B.count()
+ * - rows of A(.T) == C.count()
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A matrix
+ * @param[in] B vector
+ * @param[in, out] C vector
+ */
+template<typename Dtype>
+void GEMV(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+    const Blob<Dtype>& B, Blob<Dtype>* C) {
+  CHECK_EQ(A.shape().size(), 2) << "A must be a matrix";
+  int a1 = A.transpose() ? A.shape(1) : A.shape(0);
+  int a2 = A.transpose() ? A.shape(0) : A.shape(1);
+  int m = B.count();
+  int n = C->count();
+  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
+  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
+
+  bool TranA = A.transpose();
+  if (xpu == cpu) {
+    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
+        C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA,
+        C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/**
+ * Matrix-vector multiplication, C = A(.T) * B; transpose is considered.
+ * Shape checking is the strict checking performed by GEMV.
+ *
+ * @param[in] A input matrix
+ * @param[in] B input vector
+ * @param[out] C output vector
+ */
 template <typename Dtype>
-bool check_shape_mvv(const Blob<Dtype> & A, const Blob<Dtype> & B,
-const Blob<Dtype> & C) {
-  if (A.shape().size() != 2) return false;
-  if (B.shape().size() != 1) return false;
-  if (C.shape().size() != 1) return false;
-  int a1, a2, b1, c1;
-  a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0);
-  a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
-  b1 = B.shape().at(0);
-  c1 = C.shape().at(0);
-  if (a2 != b1) return false;
-  if (a1 != c1) return false;
-  return true;
-}
-
-/*****************************************************************************/
-// blob transformation
-
+void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype>* C) {
+  GEMV(xpu, Dtype(1), Dtype(0), A, B, C);
+}
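A shape-oriented sketch of MVDot; the checks in GEMV require the column count of A to match x and the row count to match y (illustrative only):

    // y = W * x with W 4x3, x of length 3, y of length 4.
    singa::Blob<float> W(4, 3), x(3), y(4);
    W.SetValue(0.1f);
    x.SetValue(1.0f);
    singa::MVDot(singa::cpu, W, x, &y);  // each yi = sum_j Wij*xj = 0.3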
+/************* BLAS level 3 *****************/
+/**
+ * Matrix multiplication, C = alpha A*B + beta C; A, B and C are matrices.
+ *
+ * Transpose is considered for A and B.
+ * Strict shape checking:
+ * - all are matrices
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] alpha
+ * @param[in] beta
+ * @param[in] A matrix
+ * @param[in] B matrix
+ * @param[in, out] C matrix
+ */
 template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) {
-  Blob<Dtype>* res = new Blob<Dtype>();
-  res->Mirror(A);
-  res->Reshape(shape);
-  return res;
-}
-
-// the current reshape in blob.h is:
-// void Reshape(const std::vector<int>& shape);
-
+void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A,
+    const Blob<Dtype> & B, Blob<Dtype> * C) {
+  CHECK_EQ(A.shape().size(), 2);
+  CHECK_EQ(B.shape().size(), 2);
+  CHECK_EQ(C->shape().size(), 2);
+  CHECK(!C->transpose());
+  int a1 = A.transpose() ? A.shape(1) : A.shape(0);
+  int a2 = A.transpose() ? A.shape(0) : A.shape(1);
+  int b1 = B.transpose() ? B.shape(1) : B.shape(0);
+  int b2 = B.transpose() ? B.shape(0) : B.shape(1);
+  int m = C->shape(0);
+  int n = C->shape(1);
+  CHECK_EQ(a2, b1);
+  CHECK_EQ(a1, m);
+  CHECK_EQ(b2, n);
+
+  int k = a2;
+  bool TranA = A.transpose();
+  bool TranB = B.transpose();
+  if (xpu == cpu) {
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
+        TranA, TranB, C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
+        TranA, TranB, C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
+/**
+ * Matrix multiplication, C = A(.T) * B(.T); transpose is considered.
+ * Strict shape checking:
+ * - all are matrices
+ * - shapes match for matrix multiplication
+ *
+ * @param[in] A input matrix
+ * @param[in] B input matrix
+ * @param[out] C output matrix
+ */
 template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
-  return Reshape(A, tmpshape);
-}
-
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  return Reshape(A, tmpshape);
-}
-
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  tmpshape.push_back(dim3);
-  return Reshape(A, tmpshape);
-}
-
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2,
-int dim3, int dim4) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  tmpshape.push_back(dim3);
-  tmpshape.push_back(dim4);
-  return Reshape(A, tmpshape);
-}
+void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype>* C) {
+  GEMM(xpu, Dtype(1), Dtype(0), A, B, C);
+}
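A sketch showing how transpose interacts with GEMM via the Transpose helper from blob.h, which only flips a flag and shares the underlying data (illustrative only):

    // C = A^T * B without copying A; Transpose only flips a flag on a
    // new Blob that shares data_ with A.
    singa::Blob<float> A(3, 2), B(3, 4), C(2, 4);
    singa::Blob<float>* At = singa::Transpose(A);  // logical 2x3 view
    singa::MMDot(singa::cpu, *At, B, &C);
    delete At;  // frees only the wrapper, not the shared data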
+/*********************** Inner and Outer product****************************/
+/**
+ * Inner product of two vectors.
+ * Loose shape checking: A.count() == B.count().
+ *
+ * @param[in] A input vector (shape checking uses A.count()).
+ * @param[in] B input vector (shape checking uses B.count()).
+ * @return the inner product value.
+ */
 template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2,
-int dim3, int dim4, int dim5) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  tmpshape.push_back(dim3);
-  tmpshape.push_back(dim4);
-  tmpshape.push_back(dim5);
-  return Reshape(A, tmpshape);
-}
-
+Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) {
+  Dtype res = 0;
+  CHECK_EQ(A.count(), B.count());
+  int n = A.count();
+  if (xpu == cpu) {
+    res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
+  }
+#endif  // USE_GPU
+  return res;
+}

-template <typename Dtype>
-Blob<Dtype>* Transpose(const Blob<Dtype> & A) {
-  Blob<Dtype>* res = new Blob<Dtype>();
-  res->Mirror(A);
-  res->setTranspose();
-  return res;
-}
-// return A^T
-
-/*****************************************************************************/
-// class1 matrix operation
-
-void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C);
-// A, B and C are matrix
-
-void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C);
-// A is matrix,B and C are vector
-
-void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C);
-// C is matrix,A and B are vector
-
-float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B);
-// A and B are vectors
-
-void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C, float alpha = 1, float beta = 1);
-// C = alpha*A*B+beta*C, A, B and C are matrix
-
-/*****************************************************************************/
-// class2 element-wise operation
-
-// element-wise generalized operation defined in Op
-
-template<typename Op>
-void E_Func(XPU xpu, Blob<float> * A, float alpha) {
-  if (xpu == cpu) {
-    int n = get_size(A->shape());
-    cpu_e_f<Op>(n, alpha, A->mutable_cpu_data());
-  }
-  #ifdef SINGA_GPU
-  if (xpu == gpu) {
-    // gpu part
-    int n = get_size(A->shape());
-    gpu_e_f<Op>(n, alpha, A->mutable_gpu_data());
-  }
-  #endif  // SINGA_GPU
-}
-
-template<typename Op>
-void E_Func(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) {
-  if (check_shape_equal(A, *B, *B)) {
-    int n = get_size(A.shape());
-    if (xpu == cpu) {
-      cpu_e_f<Op>(n, A.cpu_data(), alpha, B->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      gpu_e_f<Op>(n, A.gpu_data(), alpha, B->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}

+/**
+ * Outer product, C = A * B^T; transpose is disabled.
+ * Loose shape checking: A.count() * B.count() == C.count().
+ *
+ * @param[in] A input vector
+ * @param[in] B input vector
+ * @param[out] C output matrix
+ */
+template <typename Dtype>
+void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype> * C) {
+  CHECK(!C->transpose());  // do not support C.T now
+
+  int m = A.count();
+  int n = B.count();
+  CHECK_EQ(C->count(), m * n);
+
+  if (xpu == cpu) {
+    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0),
+        false, false, C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, Dtype(1), Dtype(0),
+        false, false, C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
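A sketch of the two products; values are illustrative:

    singa::Blob<float> a(3), b(3), M(3, 3);
    a.SetValue(2.0f);
    b.SetValue(1.0f);
    float ip = singa::VVDot(singa::cpu, a, b);   // 2*1 + 2*1 + 2*1 = 6
    singa::OuterProduct(singa::cpu, a, b, &M);   // Mij = ai * bj = 2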
+/*********************** Element-wise functions ***********************/
+/**
+ * Apply the function from Op to each element in A and put the result into
+ * B, i.e., Bi = Op(Ai).
+ * Loose shape checking: A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}

+/**
+ * Apply the function from Op to each element in A and B, and put the result
+ * into C, i.e., Ci = Op(Ai, Bi).
+ * Loose shape checking: A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(),
+        C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(),
+        C->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}

+/**
+ * Bi = Op(alpha, Ai)
+ * Loose shape checking: A.count() == B.count().
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  // TODO(wangwei) add the gpu version
+#endif  // USE_GPU
+}
+/**
+ * Ci = Op(alpha, Ai, Bi)
+ * Loose shape checking: A, B and C are of the same size.
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
+    Blob<Dtype>* C) {
+  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
+  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B.cpu_data(),
+        C->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  // TODO(wangwei) add the gpu version
+#endif  // USE_GPU
+}

+/**
+ * Copy A into B.
+ * Currently uses std::copy, which has shown better performance than memcpy:
+ * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
+ * TODO(wangwei) test blas copy vs std::copy.
+ *
+ * Loose shape checking: A.count() == B.count().
+ */
+template<typename Dtype>
+void Copy(XPU xpu, const Blob<Dtype>& A, Blob<Dtype>* B) {
+  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
+  if (xpu == cpu)
+    std::copy(A.cpu_data(), A.cpu_data() + A.count(),
+        B->mutable_cpu_data());
+  else
+    LOG(FATAL) << "Not implemented";
+}

+/**
+ * C = A + B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Copy(xpu, A, C);
+  AXPY(xpu, Dtype(1), B, C);
+}

+/**
+ * C = A - B
+ * Implemented using Copy and AXPY.
+ */
+template<typename Dtype>
+void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Copy(xpu, A, C);
+  AXPY(xpu, Dtype(-1), B, C);
+}

+/**
+ * C = A * B (element-wise), implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Map<singa::op::Mult<Dtype>>(xpu, A, B, C);
+  // TODO(wangwei) use MKL's vector func
+}
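A sketch composing the element-wise helpers; Add internally performs Copy followed by AXPY (illustrative only):

    // D = (A + B) .* A
    singa::Blob<float> A(4), B(4), S(4), D(4);
    A.SetValue(3.0f);
    B.SetValue(1.0f);
    singa::Add(singa::cpu, A, B, &S);    // Si = Ai + Bi = 4
    singa::Mult(singa::cpu, S, A, &D);   // Di = Si * Ai = 12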
+/**
+ * C = A / B (element-wise), implemented using
+ * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
+ */
+template<typename Dtype>
+void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B,
+    Blob<Dtype> * C) {
+  Map<singa::op::Div<Dtype>>(xpu, A, B, C);
+  // TODO(wangwei) use MKL's vector func
+}
+/*************************1D<-->2D op/transform***************************/
+/**
+ * Add A to each row of B, i.e., Bij = alpha*Aj + beta*Bij.
+ * Loose shape checking: B.count() % A.count() == 0;
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+    Blob<Dtype> * B) {
+  CHECK_EQ(B->count() % A.count(), 0) << "# cols of B must = length of A";
+  int m = A.count(), n = B->count() / m;
+  if (xpu == cpu) {
+    Blob<Dtype> one(n);
+    one.SetValue(1);
+    // B (n rows x m cols) = alpha * one (n x 1) * A (1 x m) + beta * B
+    cpu_gemm(one.cpu_data(), A.cpu_data(), n, m, 1, alpha, beta,
+        false, false, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    singa_gpu_add_vec_row(B->gpu_data(),
+        A.gpu_data(), A.gpu_data(), m, n, n);
+  }
+#endif  // USE_GPU
+}
+/**
+ * Add A to each row of B, i.e., Bij = Aj + Bij.
+ * Loose shape checking: B.count() % A.count() == 0;
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) {
+  MVAdd(xpu, Dtype(1), Dtype(1), A, B);
+}

+/**
+ * Copy A to each row of B.
+ * Loose shape checking: B.count() % A.count() == 0;
+ * # rows of B = B.count() / A.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  MVAdd(xpu, Dtype(1), Dtype(0), A, B);
+}

+/**
+ * Sum each column of matrix A into vector B,
+ * i.e., Bj = alpha*\sum_i Aij + beta*Bj.
+ * Loose shape checking: A.count() % B.count() == 0;
+ * # rows of A = A.count() / B.count().
+ * Transpose is disabled.
+ */
+template<typename Dtype>
+void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A,
+    Blob<Dtype> * B) {
+  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
+
+  int m = B->count(), n = A.count() / m;
+  if (xpu == cpu) {
+    Blob<Dtype> one(n);
+    one.SetValue(1);
+    // B (1 x m) = alpha * one (1 x n) * A (n rows x m cols) + beta * B
+    cpu_gemm(one.cpu_data(), A.cpu_data(), 1, m, n, alpha, beta,
+        false, false, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
+  }
+#endif  // USE_GPU
+}
+/**
+ * Reduce each row of A to an element of B.
+ * Loose shape checking: A.count() % B.count() == 0;
+ * # rows of A = A.count() / B.count().
+ */
+template<typename Op, typename Dtype>
+void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(A.count() % B->count(), 0) << "row size of A must = B length";
+  int m = B->count(), n = A.count() / m;
+  if (xpu == cpu) {
+    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}
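A sketch of Reduce2D with op::Sum, the pattern used for a softmax denominator; illustrative only:

    // Per-row sums of a 4x3 matrix.
    singa::Blob<float> A(4, 3), s(4);
    A.SetValue(1.0f);
    singa::Reduce2D<singa::op::Sum>(singa::cpu, A, &s);  // si = 3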
+/**
+ * Duplicate each element of A into a row of B.
+ * Loose shape checking: B.count() % A.count() == 0;
+ * # rows of B = B.count() / A.count().
+ */
+template<typename Op, typename Dtype>
+void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  CHECK_EQ(B->count() % A.count(), 0) << "row size of B must = length of A";
+  int m = A.count(), n = B->count() / m;
+  if (xpu == cpu) {
+    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}

+/***********************************************************/
+/**
+ * Apply the function from Op to each element in A in place, i.e.,
+ * Ai = Op(Ai). Requires an Op implementing Map(Dtype*).
+ */
+template<typename Op, typename Dtype>
+void Map(XPU xpu, Blob<Dtype>* A) {
+  if (xpu == cpu) {
+    cpu_e_f<Op>(A->count(), A->mutable_cpu_data());
+  }
+#ifdef USE_GPU
+  if (xpu == gpu) {
+    gpu_e_f<Op>(A->count(), A->mutable_gpu_data());
+  }
+#endif  // USE_GPU
+}

+/**
+ * B = e ^ A
+ */
+template<typename Dtype>
+void Exp(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Exp<Dtype>>(xpu, A, B);
+}

+/**
+ * Element-wise operation: b = 1.0f / (1.0f + expf(-a));
+ */
+template<typename Dtype>
+inline void Sigmoid(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Sigmoid<Dtype>>(xpu, A, B);
+}

+/**
+ * Element-wise operation: b = a * (1.0f - a);
+ */
+template<typename Dtype>
+inline void SigmoidGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::SigmoidGrad<Dtype>>(xpu, A, B);
+}

+/**
+ * Element-wise operation: b = std::max(a, 0)
+ */
+template<typename Dtype>
+inline void Relu(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Relu<Dtype>>(xpu, A, B);
+}

+/**
+ * Element-wise operation: b = a > 0 ? 1 : 0;
+ */
+template<typename Dtype>
+inline void ReluGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::ReluGrad<Dtype>>(xpu, A, B);
+}

+/**
+ * Element-wise operation: b = tanh(a);
+ */
+template<typename Dtype>
+inline void Tanh(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Tanh<Dtype>>(xpu, A, B);
+}

+/**
+ * B = 1 - A^2
+ */
+template<typename Dtype>
+inline void TanhGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::TanhGrad<Dtype>>(xpu, A, B);
+}
+/**
+ * B = log(1 + exp(A))
+ */
+template<typename Dtype>
+inline void Softplus(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Softplus<Dtype>>(xpu, A, B);
+}

+/**
+ * B = 1.0f / (1.0f + expf(-A));
+ */
+template<typename Dtype>
+inline void SoftplusGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::SoftplusGrad<Dtype>>(xpu, A, B);
+}

+template<typename Dtype>
+inline void Square(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Square<Dtype>>(xpu, A, B);
+}

+template<typename Dtype>
+inline void SquareGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::SquareGrad<Dtype>>(xpu, A, B);
+}

-template<typename Op>
-void E_Func(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C, float alpha, float beta) {
-  if (check_shape_equal(A, B, *C)) {
-    int n = get_size(A.shape());
-    if (xpu == cpu) {
-      cpu_e_f<Op>(n, A.cpu_data(), B.cpu_data(), alpha, beta,
-          C->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      gpu_e_f<Op>(n, A.gpu_data(), B.gpu_data(), alpha, beta,
-          C->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-
-inline void Set(XPU xpu, Blob<float> * A, float alpha) {
-  E_Func<singa::op::Set>(xpu, A, alpha);
-}

+template<typename Dtype>
+inline void Sqrt(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) {
+  Map<singa::op::Sqrt<Dtype>>(xpu, A, B);
+}
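A sketch of how the activation wrappers pair up in a layer; note that the *Grad ops take the forward output as their input (illustrative only):

    singa::Blob<float> in(8), out(8), grad(8);
    in.SetValue(-0.5f);
    singa::Relu(singa::cpu, in, &out);        // outi = max(ini, 0) = 0
    singa::ReluGrad(singa::cpu, out, &grad);  // gradi = outi > 0 ? 1 : 0 = 0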
+/**
+ * B = A < alpha ? 1 : 0
+ */
+template<typename Dtype>
+inline void Threshold(XPU xpu, Dtype alpha, const Blob<Dtype> & A,
+    Blob<Dtype> * B) {
+  Map<singa::op::Threshold<Dtype>>(xpu, alpha, A, B);
+}
-// element-wise operation: Ai = alpha
-
-inline void Scale(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha) {
-  E_Func<singa::op::Scale>(xpu, A, B, alpha);
-}
-// element-wise operation: Bi = alpha*Ai
-
-inline void Exp(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha = 2.71) {
-  E_Func<singa::op::Exp>(xpu, A, B, alpha);
-}
-// element-wise operation: Bi = alpha^Ai
-
-inline void Exp_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha = 2.71) {
-  E_Func<singa::op::Exp_grad>(xpu, A, B, alpha);
-}
-// element-wise operation: Bi = Ai*log(alpha)
-
-inline void Gsigmoid(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha) {
-  E_Func<singa::op::Gsigmoid>(xpu, A, B, alpha);
-}
-// element-wise operation: b = 1.0f / (1.0f + expf(-a * alpha));
-
-inline void Gsigmoid_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha) {
-  E_Func<singa::op::Gsigmoid_grad>(xpu, A, B, alpha);
-}
-// element-wise operation: b = alpha * a * ( 1.0f - a );
-
-inline void Grelu(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha = 0) {
-  E_Func<singa::op::Grelu>(xpu, A, B, alpha);
-}
-// element-wise operation: b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a;
-
-inline void Grelu_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha = 0) {
-  E_Func<singa::op::Grelu_grad>(xpu, A, B, alpha);
-}
-// element-wise operation: b = a > 0.0f ? 1.0f : alpha;
-
-inline void Gtanh(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha) {
-  E_Func<singa::op::Gtanh>(xpu, A, B, alpha);
-}
-// element-wise operation: b = tanhf( a * alpha );
-
-inline void Gtanh_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha) {
-  E_Func<singa::op::Gtanh_grad>(xpu, A, B, alpha);
-}
-// element-wise operation: b = alpha * ( 1.0f - a * a );
-
-inline void Softplus(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  E_Func<singa::op::Softplus>(xpu, A, B, 0);
-}
-// element-wise operation: b = logf(1 + expf(a));
-
-inline void Softplus_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  E_Func<singa::op::Softplus_grad>(xpu, A, B, 0);
-}
-// element-wise operation: b = 1.0f / (1.0f + expf(-a));
-
-inline void Square(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  E_Func<singa::op::Square>(xpu, A, B, 0);
-}
-// element-wise operation: b = a * a;
-
-inline void Square_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  E_Func<singa::op::Square_grad>(xpu, A, B, 0);
-}
-// element-wise operation: b = 2 * sqrt(a);
-
-inline void Sqrt(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  E_Func<singa::op::Sqrt>(xpu, A, B, 0);
-}
-// element-wise operation: b = sqrt(a);
-
-inline void Threshold(XPU xpu, const Blob<float> & A, float alpha,
-Blob<float> * B) {
-  E_Func<singa::op::Threshold>(xpu, A, B, alpha);
-}
-// element-wise operation: b = a < alpha ?
1.0f : 0.0f; - -inline void Add(XPU xpu, const Blob<float> & A, const Blob<float> & B, -Blob<float> * C) { - E_Func<singa::op::Add>(xpu, A, B, C, 0, 0); -} -// element-wise operation: Ci = Ai+Bi A,B and C should have the same size - -inline void Sub(XPU xpu, const Blob<float> & A, const Blob<float> & B, -Blob<float> * C) { - E_Func<singa::op::Sub>(xpu, A, B, C, 0, 0); -} -// element-wise operation: Ci = Ai-Bi A,B and C should have the same size - -inline void Mult(XPU xpu, const Blob<float> & A, const Blob<float> & B, -Blob<float> * C) { - E_Func<singa::op::Mult>(xpu, A, B, C, 0, 0); -} -// element-wise operation: Ci = Ai*Bi A,B and C should have the same size - -inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, -Blob<float> * C) { - E_Func<singa::op::Div>(xpu, A, B, C, 0, 0); -} -// element-wise operation: Ci = Ai/Bi A,B and C should have the same size - - -void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha); -// element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size - -/*****************************************************************************/ -// class3 matrix-vector expand/reduce operation - -template<typename Op> -void Reduce_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { - if (check_shape_mv(A, *B)) { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; - if (xpu == cpu) { - cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - #ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - #endif // SINGA_GPU - } else { - // report errors here - } -} -// reduce each row of A to an element of B e.g. the sum operation in softmax - -template<typename Op> -void Expand_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { - if (check_shape_mv(*B, A)) { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; - if (xpu == cpu) { - cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - #ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - #endif // SINGA_GPU - } else { - // report errors here - } -} -// expand each element in A into a row of B - -void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B); -// A is a vector, B is a matrix , let each row of B to be A - -void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, -float alpha, float beta); -// A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij -// will use gemm. faster than general expand_f - -void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, -float alpha, float beta); -// A is a vector, B is a matrix , Ai = \sigma_j_{alpha*Bij}+beta*Ai -// will use gemm. 
faster than general reduce_f

 }  // end of namespace singa

 #endif  // SINGA_BLOB_MATH_BLOB_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/include/singa/blob/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h
index 3747568..1131c5d 100644
--- a/include/singa/blob/singa_op.h
+++ b/include/singa/blob/singa_op.h
@@ -33,314 +33,318 @@
 #endif  // SINGA_GPU

 namespace singa {
-  enum XPU { cpu, gpu, any};

 namespace op {
-struct Set {
-  inline static void Map(float alpha, float * a) {
-    *a = alpha;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float * a, int n) {
-    singa::singa_gpu_set_value(a, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-struct Scale {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = alpha * a;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_scale(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Scale_grad {
-  inline static void Map(float alpha, float * output) {
-    *output = alpha;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float * output, int n) {
-    singa::singa_gpu_scale_grad(output, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Exp {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = pow(a, alpha);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_exp(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Exp_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    // log is the natural log based on e
-    *b = a * log(alpha);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_exp_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Gsigmoid {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = 1.0f / (1.0f + expf(-a * alpha));
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_sigmoid(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Gsigmoid_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = alpha * a * (1.0f - a);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_sigmoid_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Grelu {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = (1 - alpha) * std::max(a, 0.0f) + alpha * a;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_relu(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Grelu_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = a > 0.0f ? 1.0f : alpha;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_relu_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Gtanh {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = tanhf(a * alpha);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_tanh(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Gtanh_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = alpha * (1.0f - a * a);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Softplus {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = logf(1 + expf(a));
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_softplus(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Softplus_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_softplus_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Square {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = a * a;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_square(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Square_grad {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = 2 * sqrt(a);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_square_grad(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Sqrt {
-  inline static void Map(float alpha, const float & a, float * b) {
-    *b = sqrt(a);
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, const float * a,
-                             float * b, int n) {
-    singa::singa_gpu_sqrt(a, b, alpha, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Add {
-  inline static void Map(float alpha, float beta, const float & a,
-                         const float & b, float * c) {
-    *c = a + b;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float beta, const float * a,
-                             const float * b, float * c, int n) {
-    singa::singa_gpu_add(a, b, c, alpha, beta, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Sub {
-  inline static void Map(float alpha, float beta, const float & a,
-                         const float & b, float * c) {
-    *c = a - b;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float beta, const float * a,
-                             const float * b, float * c, int n) {
-    singa::singa_gpu_sub(a, b, c, alpha, beta, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Mult {
-  inline static void Map(float alpha, float beta, const float & a,
-                         const float & b, float * c) {
-    *c = a * b;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float beta, const float * a,
-                             const float * b, float * c, int n) {
-    singa::singa_gpu_mult(a, b, c, alpha, beta, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Div {
-  inline static void Map(float alpha, float beta, const float & a,
-                         const float & b, float * c) {
-    *c = a / b;
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(float alpha, float beta, const float * a,
-                             const float * b, float * c, int n) {
-    singa::singa_gpu_div(a, b, c, alpha, beta, n);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Sum {
-  inline static void Map(const float * a, int n, float * b) {
-    *b = 0;
-    for (int i = 0 ; i < n ; i++) {
-      *b += a[i];
-    }
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(const float * a, int n, float * b) {
-    float *sum = NULL;
-    cudaMalloc(<void**>(&sum), n*sizeof(float));
-
-    singa::singa_gpu_sum_vec(a, sum, n);
-
-    cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice);
-    cudaFree(sum);
-  }
-  #endif  // SINGA_GPU
-};
-
-struct Expand_Div {
-  inline static void Map(const float & a, int n, float * b) {
-    for (int i = 0 ; i < n ; i++) {
-      b[i] /= a;
-    }
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(const float & a, int n, float * b) {
-    singa::singa_gpu_scale(b, b, a, n);
-  }
-  #endif  // SINGA_GPU
-};

+/**
+ * b = e^a
+ */
+template<typename Dtype>
+struct Exp {
+  inline static void Map(const float & a, float * b) {
+    *b = exp(a);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    // TODO(wangwei) align singa_gpu_exp with the new exp(a) semantics
+    singa::singa_gpu_exp(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};
+/**
+ * b = log(a), base e
+ */
+template<typename Dtype>
+struct Log {
+  inline static void Map(const float & a, float * b) {
+    *b = log(a);
+  }
+};

+template<typename Dtype>
+struct Sigmoid {
+  inline static void Map(const float & a, float * b) {
+    *b = 1.0f / (1.0f + expf(-a));
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_sigmoid(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};
+template<typename Dtype>
+struct SigmoidGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = a * (1.0f - a);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_sigmoid_grad(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Relu {
+  inline static void Map(const float & a, float * b) {
+    *b = std::max(a, 0.0f);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_relu(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct ReluGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = a > 0 ? 1 : 0;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_relu_grad(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Tanh {
+  inline static void Map(const float & a, float * b) {
+    *b = tanhf(a);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_tanh(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct TanhGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = 1 - a * a;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_tanh_grad(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Softplus {
+  inline static void Map(const float & a, float * b) {
+    *b = logf(1 + expf(a));
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_softplus(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct SoftplusGrad {
+  inline static void Map(const float & a, float * b) {
+    *b = 1.0f / (1.0f + expf(-a));
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_softplus_grad(a, b, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Square {
+  inline static void Map(const float & a, float * b) {
+    *b = a * a;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_square(a, b, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct SquareGrad {
+  // the input a is the forward output, i.e., the squared value
+  inline static void Map(const float & a, float * b) {
+    *b = 2 * sqrt(a);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_square_grad(a, b, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Sqrt {
+  inline static void Map(const float & a, float * b) {
+    *b = sqrt(a);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, float * b, int n) {
+    singa::singa_gpu_sqrt(a, b, n);
+  }
+#endif  // SINGA_GPU
+};

+/*********************************************************************/
+/**
+ * c = pow(a, b), i.e., c = a^b
+ */
+template<typename Dtype>
+struct Pow {
+  inline static void Map(const float & a, const float & b, float * c) {
+    *c = pow(a, b);
+  }
+};
+template<typename Dtype>
+struct Mult {
+  inline static void Map(const float & a, const float & b, float * c) {
+    *c = a * b;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float* a, const float* b, float* c, int n) {
+    singa::singa_gpu_mult(a, b, c, 1, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Div {
+  inline static void Map(const float & a, const float & b, float * c) {
+    *c = a / b;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a,
+                             const float * b, float * c, int n) {
+    singa::singa_gpu_div(a, b, c, 1, 1, n);
+  }
+#endif  // SINGA_GPU
+};

+/*********************************************************************/
+template<typename Dtype>
+struct Set {
+  inline static void Map(float alpha, float * a) {
+    *a = alpha;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(float alpha, float * a, int n) {
+    singa::singa_gpu_set_value(a, alpha, n);
+  }
+#endif  // SINGA_GPU
+};

+template<typename Dtype>
+struct Threshold {
+  inline static void Map(float alpha, const float & a, float * b) {
+    *b = a < alpha ? 1.0f : 0.0f;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(float alpha, const float * a,
+                             float * b, int n) {
+    singa::singa_gpu_threshold(a, b, alpha, n);
+  }
+#endif  // SINGA_GPU
+};

+/**********************************/
+struct Expand_Div {
+  inline static void Map(const float & a, int n, float * b) {
+    for (int i = 0; i < n; i++) {
+      b[i] /= a;
+    }
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float & a, int n, float * b) {
+    singa::singa_gpu_scale(b, b, a, n);
+  }
+#endif  // SINGA_GPU
+};

+struct Repmat {
+  inline static void Map(const float & a, int n, float * b) {
+    for (int i = 0; i < n; i++) {
+      b[i] = a;
+    }
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float & a, int n, float * b) {
+    singa::singa_gpu_set_value(b, a, n);
+  }
+#endif  // SINGA_GPU
+};

+struct Scale {
+  inline static void Map(float alpha, const float & a, float * b) {
+    *b = alpha * a;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(float alpha, const float * a,
+                             float * b, int n) {
+    singa::singa_gpu_scale(a, b, alpha, n);
+  }
+#endif  // SINGA_GPU
+};

+struct Scale_grad {
+  inline static void Map(float alpha, float * output) {
+    *output = alpha;
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(float alpha, float * output, int n) {
+    singa::singa_gpu_scale_grad(output, alpha, n);
+  }
+#endif  // SINGA_GPU
+};

+struct ExpGrad {
+  inline static void Map(float alpha, const float & a, float * b) {
+    // log is the natural log based on e
+    *b = a * log(alpha);
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(float alpha, const float * a,
+                             float * b, int n) {
+    singa::singa_gpu_exp_grad(a, b, alpha, n);
+  }
+#endif  // SINGA_GPU
+};
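All op functors above follow the same static Map/CudaMap contract, so a new element-wise operation plugs into cpu_e_f and the Blob-level Map without touching the math code. A hypothetical example, not part of this patch:

    // Hypothetical absolute-value op; if placed in namespace singa::op it
    // can be used as Map<op::Abs<float>>(cpu, A, &B) at the Blob level or
    // as cpu_e_f<op::Abs<float>>(n, in, out) at the address level.
    template<typename Dtype>
    struct Abs {
      inline static void Map(const float & a, float * b) {
        *b = a > 0 ? a : -a;
      }
    };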
-struct Repmat {
-  inline static void Map(const float & a, int n, float * b) {
-    for (int i = 0 ; i < n ; i++) {
-      b[i] = a;
-    }
-  }
-  #ifdef SINGA_GPU
-  inline static void CudaMap(const float & a, int n, float * b) {
-    singa::singa_gpu_set_value(b, a, n);
-  }
-  #endif  // SINGA_GPU
-};

+struct Sum {
+  inline static void Map(const float * a, int n, float * b) {
+    *b = 0;
+    for (int i = 0; i < n; i++) {
+      *b += a[i];
+    }
+  }
+#ifdef SINGA_GPU
+  inline static void CudaMap(const float * a, int n, float * b) {
+    float *sum = NULL;
+    cudaMalloc(reinterpret_cast<void**>(&sum), n * sizeof(float));
+
+    singa::singa_gpu_sum_vec(a, sum, n);
+
+    cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice);
+    cudaFree(sum);
+  }
+#endif  // SINGA_GPU
+};

 };  // namespace op
 };  // namespace singa

 #endif  // SINGA_BLOB_SINGA_OP_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/include/singa/utils/blob.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
index 0ebf8fd..eecb674 100644
--- a/include/singa/utils/blob.h
+++ b/include/singa/utils/blob.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
-*
+*
 *     http://www.apache.org/licenses/LICENSE-2.0
-*
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -121,21 +121,53 @@ class Blob {
  public:
   Blob() {}
   explicit Blob(const std::vector<int>& shape) { Reshape(shape); }
+  explicit Blob(int count) { Reshape(count); }
+  explicit Blob(int a, int b) { Reshape(a, b); }
+  explicit Blob(int a, int b, int c) { Reshape(a, b, c); }
+  explicit Blob(int a, int b, int c, int d) { Reshape(a, b, c, d); }
   /**
-   * @brief Change the dimensions of the blob, allocating new memory if
-   * necessary.
-   *
-   * This function can be called both to create an initial allocation
-   * of memory, and to adjust the dimensions of a top blob during Layer::Reshape
-   * or Layer::Forward. When changing the size of blob, memory will only be
-   * reallocated if sufficient memory does not already exist, and excess memory
-   * will never be freed.
-   *
-   * Note that reshaping an input blob and immediately calling Net::Backward is
-   * an error; either Net::Forward or Net::Reshape need to be called to
-   * propagate the new input shape to higher layers.
+   * Change the shape of the blob; re-allocate memory if the Blob size()
+   * changes.
+   *
+   * @param[in] shape specifies the size of each dimension; shape[0] is the
+   * highest dimension, i.e., stride[0] = shape[1] * shape[2] * ...
    */
   void Reshape(const std::vector<int>& shape);
+  /**
+   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 1.
+   *
+   * @see Reshape(const std::vector<int>&).
+   * @param[in] count total number of elements.
+   */
+  void Reshape(int count);
+  /**
+   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 2.
+   *
+   * @param[in] a the highest dimension size, i.e., a = shape[0]; e.g., a
+   * could be the batchsize.
+   * @param[in] b, b = shape[1]; e.g., b could be the length of the feature
+   * vector.
+   */
+  void Reshape(int a, int b);
+  /**
+   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 3.
+ * + * @param[in] a, a = shape[0] + * @param[in] b, b = shape[1] + * @param[in] c, c = shape[2] + */ + void Reshape(int a, int b, int c); + /** + * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 4. + * + * @param[in] a, a = shape[0] + * @param[in] b, b = shape[1] + * @param[in] c, c = shape[2] + * @param[in] d, d = shape[3] + */ + void Reshape(int a, int b, int c, int d); + /** + * Reshape as the shape of *other* Blob. + * @param[in] other + */ void ReshapeLike(const Blob& other); /** * @brief Copy from a source Blob. @@ -149,20 +181,45 @@ class Blob { void CopyFrom(const Blob<Dtype>& source, bool reshape); void FromProto(const singa::BlobProto& proto); void ToProto(singa::BlobProto* proto) const; + void SetValue(Dtype v); + /** + * Compute the sum of absolute values (L1 norm) of the data. + */ + Dtype asum_data() const; /** - * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the - * data_ of Blob other -- useful in Layer&s which simply perform a copy - * in their Forward pass. + * Sum all elements + */ + Dtype sum_data() const; + /** + * Share data with the other Blob. + * Set the data_ shared_ptr to point to the SyncedMemory holding the data_ + * of Blob other. * - * This deallocates the SyncedMemory holding this Blob's data_, as + * It may deallocate the SyncedMemory holding this Blob's data_, as * shared_ptr calls its destructor when reset with the "=" operator. */ void ShareData(const Blob& other); void Swap(Blob& other); + /** + * @return the shape vector. + */ inline const std::vector<int>& shape() const { return shape_; } - inline int count() const { return count_; } - inline const int version() const { return version_; } - inline void set_version(int v) { version_ = v; } + /** + * @return the size of the k-th dimension. + */ + inline const int shape(int k) const { + CHECK_LT(k, shape_.size()); + return shape_.at(k); + } + inline int count() const { + return count_; + } + inline const int version() const { + return version_; + } + inline void set_version(int v) { + version_ = v; + } inline const Dtype* cpu_data() const { CHECK(data_); return static_cast<const Dtype*>(data_->cpu_data()); @@ -183,34 +240,90 @@ class Blob { CHECK(data_); return static_cast<Dtype*>(data_->mutable_gpu_data()); } - /// @brief Compute the sum of absolute values (L1 norm) of the data. - Dtype asum_data() const; - Dtype sum_data() const; - inline void setTranspose() { - isTranspose_ = !isTranspose_; - } - inline bool isTranspose() const { - return isTranspose_; + inline void set_transpose() { + transpose_ = true; } - inline void Mirror(const Blob<Dtype> & other) { - data_ = other.data_; - shape_ = other.shape_; - count_ = other.count_; - capacity_ = other.capacity_; - version_ = other.version_; - isTranspose_ = other.isTranspose_; + inline bool transpose() const { + return transpose_; } - protected: std::shared_ptr<SyncedMemory> data_ = nullptr; std::vector<int> shape_; int count_ = 0; int capacity_ = 0; int version_ = -1; - bool isTranspose_ = false; + bool transpose_ = false; }; // class Blob +/** + * Reshape a Blob. + * @return a new Blob with the given shape, it shares the internal data_ with + * the original Blob, i.e., no memory copy and allocation. + */ +template <typename Dtype> +Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) { + Blob<Dtype>* res = new Blob<Dtype>(A); + res->Reshape(shape); + return res; +} + +/** + * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*). 
+ */
+template <typename Dtype>
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int count) {
+  std::vector<int> tmpshape;
+  tmpshape.push_back(count);
+  return Reshape(A, tmpshape);
+}
+/**
+ * Helper of Reshape(const Blob<Dtype>&, const std::vector<int>&).
+ */
+template <typename Dtype>
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1) {
+  std::vector<int> tmpshape;
+  tmpshape.push_back(dim0);
+  tmpshape.push_back(dim1);
+  return Reshape(A, tmpshape);
+}
+/**
+ * Helper of Reshape(const Blob<Dtype>&, const std::vector<int>&).
+ */
+template <typename Dtype>
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2) {
+  std::vector<int> tmpshape;
+  tmpshape.push_back(dim0);
+  tmpshape.push_back(dim1);
+  tmpshape.push_back(dim2);
+  return Reshape(A, tmpshape);
+}
+/**
+ * Helper of Reshape(const Blob<Dtype>&, const std::vector<int>&).
+ */
+template <typename Dtype>
+Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1,
+    int dim2, int dim3) {
+  std::vector<int> tmpshape;
+  tmpshape.push_back(dim0);
+  tmpshape.push_back(dim1);
+  tmpshape.push_back(dim2);
+  tmpshape.push_back(dim3);
+  return Reshape(A, tmpshape);
+}
+
+/**
+ * @return a new Blob which shares all internal members with the input Blob
+ * except that the transpose_ field is set to true.
+ */
+template <typename Dtype>
+Blob<Dtype>* Transpose(const Blob<Dtype> & A) {
+  Blob<Dtype>* res = new Blob<Dtype>(A);
+  res->set_transpose();
+  return res;
+}
+
+// TODO(wangwei) remove mshadow functions.
 using namespace mshadow;
 using mshadow::cpu;
@@ -249,6 +362,7 @@ inline Tensor<cpu, 1> Tensor1(Blob<float>* blob) {
   return tensor;
 }
+
 }  // namespace singa
 #endif  // SINGA_UTILS_BLOB_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/src/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc
deleted file mode 100644
index fb1c42e..0000000
--- a/src/blob/math_addr.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/blob/math_addr.h"
-extern "C" {
-  #include <cblas.h>
-}
-#ifdef SINGA_GPU
-#include <cuda_runtime.h>
-#endif
-#include "singa/blob/singa_op.h"
-#ifdef SINGA_GPU
-#include "cublas_v2.h"
-#endif
-
-namespace singa {
-
-const float * cpu_uni_vec(const int n) {
-  float * res = new float[n];
-  for (int i = 0; i < n; i++)
-    res[i] = 1.0;
-  return res;
-}
-
-void cpu_gemm(const float * A, const float * B, const int m, const int n,
-const int k, const float alpha, const float beta,
-const bool TranA, const bool TranB, float * C) {
-  int lda, ldb;
-  CBLAS_TRANSPOSE tA, tB;
-  lda = TranA ? m : k;
-  ldb = TranB ? k : n;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  tB = TranB ? CblasTrans : CblasNoTrans;
-  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
-    B, ldb, beta, C, n);
-}
-
-void cpu_gemv(const float * A, const float * B, const int m, const int n,
-const float alpha, const float beta, const bool TranA, float * C) {
-  CBLAS_TRANSPOSE tA;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1);
-}
-
-void cpu_axpy(const float * A, const int n, const float alpha, float * B) {
-  cblas_saxpy(n, alpha, A, 1, B, 1);
-}
-
-float cpu_dot(const float * A, const float * B, const int n) {
-  float sum = 0;
-  for (int i = 0 ; i < n ; i++)
-    sum += A[i] * B[i];
-  return sum;
-}
-
-#ifdef SINGA_GPU
-// Trick: swap A and B
-void gpu_gemm(const float * A, const float * B, const int m, const int n,
-const int k, const float alpha, const float beta, const bool TranA,
-const bool TranB, float * C) {
-  int lda = TranA ? m : k;
-  int ldb = TranB ? k : n;
-  int ldc = n;
-  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
-    A, lda, &beta, C, ldc);
-  cublasDestroy(handle);
-}
-
-void gpu_gemv(const float * A, const float * B, const int m, const int n,
-const float alpha, const float beta, const bool TranA, float * C) {
-  int lda = n;
-  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-  cublasDestroy(handle);
-}
-
-
-void gpu_axpy(const float * A, const int n, const float alpha, float * B) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-  cublasDestroy(handle);
-}
-
-
-float gpu_dot(const float * A, const float * B, const int n) {
-  cublasHandle_t handle;
-  cublasCreate(&handle);
-  float result = 0.0;
-  cublasSdot(handle, n, A, 1, B, 1, &result);
-  cublasDestroy(handle);
-  return result;
-}
-#endif
-
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/4b84dbe3/src/blob/math_blob.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc
deleted file mode 100644
index 083d3e5..0000000
--- a/src/blob/math_blob.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/blob/math_blob.h"
-#ifdef SINGA_GPU
-#include "singa/blob/math_kernel.h"
-#endif  // SINGA_GPU
-
-namespace singa {
-
-/*****************************************************************************/
-// shape_check function
-
-int get_size(const std::vector<int>& shape) {
-  int sum = 1;
-  for (unsigned int i = 0; i < shape.size(); i++) sum *= shape[i];
-  return sum;
-}
-
-/*****************************************************************************/
-// class1 matrix operation
-
-
-void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C, float alpha, float beta) {
-  if (check_shape_mmm(A, B, *C)) {
-    int m = C->shape().at(0);
-    int n = C->shape().at(1);
-    int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
-    bool TranA = A.isTranspose();
-    bool TranB = B.isTranspose();
-    if (xpu == cpu) {
-      cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta,
-        TranA, TranB, C->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// C = alpha*A*B+beta*C, A, B and C are matrix
-
-void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C) {
-  GEMM(xpu, A, B, C, 1, 0);
-}
-// A,B and C are matrix
-
-
-void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C) {
-  if (check_shape_mvv(A, B, *C)) {
-    int m = B.shape().at(0);
-    int n = C->shape().at(0);
-    bool TranA = A.isTranspose();
-    if (xpu == cpu) {
-      cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA,
-        C->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA,
-        C->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// A is matrix,B and C are vector
-
-
-void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B,
-Blob<float> * C) {
-  if (check_shape_vvm(A, B, *C)) {
-    int m = C->shape().at(0);
-    int n = C->shape().at(1);
-    if (xpu == cpu) {
-      cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0,
-        false, false, C->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0,
-        false, false, C->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// C is matrix,A and B are vector
-
-
-float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) {
-  float res = 0;
-  if (check_shape_equal(A, B, B)) {
-    int n = get_size(A.shape());
-    if (xpu == cpu) {
-      res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      // gpu part
-      res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-  return res;
-}
-// A and B are vectors
-
-void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) {
-  if (check_shape_equal(A, *B, *B)) {
-    if (xpu == cpu) {
-      cpu_axpy(A.cpu_data(), get_size(A.shape()),
-        alpha, B->mutable_cpu_data());
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      gpu_axpy(A.gpu_data(), get_size(A.shape()),
-        alpha, B->mutable_gpu_data());
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size
-
-inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) {
-  MVAdd(xpu, A, B, 1, 0);
-}
-// A is a vector, B is a matrix , let each row of B to be A
-
-void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha, float beta) {
-  if (check_shape_mv(*B, A)) {
-    int m = get_size(A.shape());
-    int n = get_size(B->shape()) / m;
-    if (xpu == cpu) {
-      const float * univ = cpu_uni_vec(n);
-      cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta,
-        false, false, B->mutable_cpu_data());
-      delete univ;
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      singa_gpu_add_vec_row(B->gpu_data(),
-        A.gpu_data(), A.gpu_data(), m, n, n);
-      // gpu part
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij
-// will use gemm. faster than general expand_f
-
-void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B,
-float alpha, float beta) {
-  if (check_shape_mv(A, *B)) {
-    int m = get_size(B->shape());
-    int n = get_size(A.shape()) / m;
-    if (xpu == cpu) {
-      const float * univ = cpu_uni_vec(n);
-      cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta,
-        false, false, B->mutable_cpu_data());
-      delete univ;
-    }
-    #ifdef SINGA_GPU
-    if (xpu == gpu) {
-      singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n);
-      // gpu part
-    }
-    #endif  // SINGA_GPU
-  } else {
-    // report errors here
-  }
-}
-// B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi
-// will use gemm. faster than general reduce_f
-
-}  // namespace singa
-
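
----------------------------------------------------------------------
For reviewers, a minimal usage sketch of the new Blob API above (the
constructor/Reshape helpers, shape(int k), and the free Reshape()/Transpose()
functions in include/singa/utils/blob.h). This sketch is not part of the
commit; it assumes the compiler-generated Blob copy constructor used by the
free Reshape()/Transpose() functions, and the glog CHECK macros already used
in blob.h.

#include "singa/utils/blob.h"

void blob_helper_sketch() {
  singa::Blob<float> a(4, 3);   // same as Reshape(std::vector<int>{4, 3})
  a.SetValue(1.0f);             // fill all 12 elements with 1
  CHECK_EQ(a.shape(0), 4);      // shape(k) checks k against shape_.size()
  CHECK_EQ(a.count(), 12);

  // The free Reshape() returns a new Blob that shares data_ with a,
  // i.e., no memory copy or allocation happens here.
  singa::Blob<float>* b = singa::Reshape(a, 12);
  CHECK_EQ(b->shape(0), 12);

  // Transpose() shares all internal members and only sets transpose_.
  singa::Blob<float>* at = singa::Transpose(a);
  CHECK(at->transpose());

  // Deleting the views is safe; the shared_ptr keeps data_ alive for a.
  delete b;
  delete at;
}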
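
----------------------------------------------------------------------
Likewise, a sketch of calling the blob-level math functions that this commit
moves from src/blob/math_blob.cc into include/singa/blob/math_blob.h (GEMM,
MMDot, AXPY, ...). It assumes the signatures survive the move unchanged and
that the XPU value cpu is visible in namespace singa, as in the deleted .cc
above; treat it as a sketch of the intended usage, not the committed API.

#include "singa/blob/math_blob.h"
#include "singa/utils/blob.h"

void math_blob_sketch() {
  singa::Blob<float> A(2, 3), B(3, 4), C(2, 4);
  A.SetValue(1.0f);
  B.SetValue(2.0f);

  // C = alpha*A*B + beta*C; check_shape_mmm() validates the shapes first.
  singa::GEMM(singa::cpu, A, B, &C, 1.0f, 0.0f);
  // MMDot() is the alpha = 1, beta = 0 shorthand for GEMM().
  singa::MMDot(singa::cpu, A, B, &C);
  // Each entry of C is now the sum of 1*2 over k = 0..2, i.e., 6.

  // AXPY: element-wise Bi = alpha*Ai + Bi for equal-sized blobs.
  singa::Blob<float> D(2, 4);
  D.SetValue(0.0f);
  singa::AXPY(singa::cpu, C, &D, 0.5f);  // D = 0.5 * C
}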
