SINGA-80 New Blob Level and Address Level Math Operation Interface * Move files from blob/ folder into utils folder. May separate some files from utils later. * Remove functions like relu/softplus/sigmoid for Blobs, since they only have one line of code, which can be written directly when called. * Compile cpu code successfully.
TODO rebase to latest master and do test. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d3379cba Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d3379cba Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d3379cba Branch: refs/heads/master Commit: d3379cbaff99fce26f9b06301cc0d63a56b062ee Parents: 4b84dbe Author: Wei Wang <[email protected]> Authored: Mon Nov 9 16:57:33 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Mon Nov 9 17:04:48 2015 +0800 ---------------------------------------------------------------------- include/singa/blob/math_addr.h | 206 ------------ include/singa/blob/math_blob.h | 597 --------------------------------- include/singa/blob/math_kernel.h | 59 ---- include/singa/blob/singa_op.h | 350 ------------------- include/singa/utils/blob.h | 55 +-- include/singa/utils/math_addr.h | 206 ++++++++++++ include/singa/utils/math_blob.h | 487 +++++++++++++++++++++++++++ include/singa/utils/math_kernel.h | 59 ++++ include/singa/utils/singa_op.h | 265 +++++++++++++++ src/blob/math_kernel.cu | 439 ------------------------ src/blob/test.cc | 165 --------- src/test/test_math.cc | 31 +- src/utils/blob.cc | 10 +- src/utils/math_kernel.cu | 439 ++++++++++++++++++++++++ 14 files changed, 1512 insertions(+), 1856 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h deleted file mode 100644 index 25cef07..0000000 --- a/include/singa/blob/math_addr.h +++ /dev/null @@ -1,206 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -#ifndef SINGA_BLOB_MATH_ADDR_H_ -#define SINGA_BLOB_MATH_ADDR_H_ -extern "C" { - #include <cblas.h> -} -#ifdef USE_GPU -#include <cuda_runtime.h> -#endif -#include "singa/blob/singa_op.h" -#ifdef USE_GPU -#include "cublas_v2.h" -#endif - - -namespace singa { - -template<typename Dtype> -void cpu_gemm(const Dtype * A, const Dtype * B, - const int m, const int n, const int k, const Dtype alpha, const Dtype beta, - const bool TranA, const bool TranB, Dtype * C) { - int lda, ldb; - CBLAS_TRANSPOSE tA, tB; - lda = TranA ? m : k; - ldb = TranB ? k : n; - tA = TranA ? CblasTrans : CblasNoTrans; - tB = TranB ? CblasTrans : CblasNoTrans; - cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, - B, ldb, beta, C, n); -} - -// should be very careful: -// m is the length of B, and n is the length of C , A is a n*m matrix -template<typename Dtype> -void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, - const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) { - int lda, ldb; - CBLAS_TRANSPOSE tA, tB; - lda = TranA ? m : k; - ldb = TranB ? k : n; - tA = TranA ? CblasTrans : CblasNoTrans; - tB = TranB ? CblasTrans : CblasNoTrans; - cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, - B, ldb, beta, C, n); - -} - -template<typename Dtype> -void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { - cblas_saxpy(n, alpha, A, 1, B, 1); -} - -template<typename Dtype> -Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) { - Dtype sum = 0; - for (int i = 0 ; i < n ; i++) - sum += A[i] * B[i]; - return sum; -} - -// element-wise -template<typename Op, typename Dtype> -void cpu_e_f(const int n, const Dtype alpha, Dtype * A) { - for (int i = 0 ; i < n ; i++) { - Op::Map(alpha, &A[i]); - } -} - -template<typename Op, typename Dtype> -void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) { - for (int i = 0 ; i < n ; i++) { - Op::Map(alpha, A[i], &B[i]); - } -} - -template<typename Op, typename Dtype> -void cpu_e_f(const int n, const Dtype * A, const Dtype * B, - const Dtype alpha, const Dtype beta, Dtype * C) { - for (int i = 0 ; i < n ; i++) { - Op::Map(alpha, beta, A[i], B[i], &C[i]); - } -} -// element-wise generalized operation defined in Op - - -// matrix/vector expand/reduce - -template<typename Op, typename Dtype> -void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) { - for (int i = 0 ; i < m ; i++) { - Op::Map(A+i*n, n, B[i]); - } -} -// reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op, typename Dtype> -void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) { - for (int i = 0 ; i < m ; i++) { - Op::Map(A[i], n, B+i*n); - } -} -// expand each element in A into a row of B - -#ifdef USE_GPU -template<typename Dtype> -void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n, - const int k, const Dtype alpha, const Dtype beta, const bool TranA, - const bool TranB, Dtype * C) { - int lda = TranA ? m : k; - int ldb = TranB ? k : n; - int ldc = n; - cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, - A, lda, &beta, C, ldc); - cublasDestroy(handle); -} - -template<typename Dtype> -void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, - const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) { - int lda = n; - cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1); - cublasDestroy(handle); -} - -template<typename Dtype> -void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { - cublasHandle_t handle; - cublasCreate(&handle); - cublasSaxpy(handle, n, &alpha, A, 1, B, 1); - cublasDestroy(handle); -} - -template<typename Dtype> -Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) { - cublasHandle_t handle; - cublasCreate(&handle); - Dtype result = 0.0; - cublasSdot(handle, n, A, 1, B, 1, &result); - cublasDestroy(handle); - return result; -} - -// element-wise -template<typename Op, typename Dtype> -void gpu_e_f(const int n, const Dtype alpha, Dtype * A) { - Op::CudaMap(alpha, A, n); -} - -template<typename Op, typename Dtype> -void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) { - Op::CudaMap(alpha, A, B, n); -} - -template<typename Op, typename Dtype> -void gpu_e_f(const int n, const Dtype * A, const Dtype * B, - const Dtype alpha, const Dtype beta, Dtype * C) { - Op::CudaMap(alpha, beta, A, B, C, n); -} -// element-wise generalized operation defined in Op - -// matrix/vector expand/reduce - -template<typename Op, typename Dtype> -void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) { - for (int i = 0 ; i < m ; i++) { - Op::CudaMap(A+i*n, n, B[i]); - } -} -// reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op, typename Dtype> -void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) { - for (int i = 0 ; i < m ; i++) { - Op::CudaMap(A[i], n, B+i*n); - } -} -// expand each element in A into a row of B -#endif // USE_GPU - -} // namespace singa -#endif // SINGA_BLOB_MATH_ADDR_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h deleted file mode 100644 index f3147e8..0000000 --- a/include/singa/blob/math_blob.h +++ /dev/null @@ -1,597 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -#ifndef SINGA_BLOB_MATH_BLOB_H_ -#define SINGA_BLOB_MATH_BLOB_H_ - -#include <vector> -#include "singa/utils/blob.h" -#include "singa/blob/singa_op.h" -#include "singa/blob/math_addr.h" - - -namespace singa { -enum XPU {cpu, gpu, any}; - -/************* BLAS level 1 *****************/ -/** - * Scale each element of A with alpha, and put the result into B. - * Bi = alpha*Ai - * Use blas scale internally. - */ -template<typename Dtype> -void Scale(xpu xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count(), B->count()); - if (xpu == cpu) - cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); -#ifdef USE_GPU -#endif -} - -/** - * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size - */ -template<typename Dtype> -void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count(), B.count()); - if (xpu == cpu) { - cpu_axpy(A.cpu_data(), A.count(), - alpha, B->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - gpu_axpy(A.gpu_data(), A.count(), - alpha, B->mutable_gpu_data()); - } -#endif // USE_GPU -} - -/************* BLAS level 2 *****************/ -/** - * Matrix vector multiplication, C = alpha A(.T) * B + beta C. - * Strict shape checking: - * - dim of A ==2 - * columsn of A(.T) == B.count() - * - rows of A(.T) == C.count() - * - * @param[in] alpha - * @param[in] beta - * @param[in] A, matrix - * @param[in] B, vector - * @param[in, out] C, vector - */ -template<typename Dtype> -void GEMV(XPU, xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, - const Blob<Dtype>& B, Blob<Dtype>* C) { - CHECK_EQ(A.shape().size(), 2) << "A must be a matrix"; - int a1, a2, m, n; - a1 = A.transpose() ? A.shape(1) : A.shape(0); - a2 = A.transpose() ? A.shape(0) : A.shape(1); - m = B.count(); - n = C->count(); - CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B"; - CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C"; - - bool TranA = A.transpose(); - if (xpu == cpu) { - cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA, - C->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - // gpu part - gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA, - C->mutable_gpu_data()); - } -#endif // USE_GPU -} -/** - * Matrix vector multiplication, C = A(.T) * B, transpose is considered. - * Loose shape checking: - * - dim of A >=2 - * - A.count() % B.count() == 0 - * - B.count() == C.count() - * - * @param[in] A input matrix - * @param[in] B input vector - * @param[out] C output vector - */ -template <typename Dtype> -void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) -{ - GEMV(xpu, Dtype(1), Dtype(0), A, B, C); -} - -/************* BLAS level 3 *****************/ -/** - * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix. - * - * Tranpose is considered for A and B. - * Strict shape checking: - * - all are matrix - * - shapes match for matrix multiplication - * - * @param[in] alpha - * @param[in] beta - * @param[in] A, matrix - * @param[in] B, matrix - * @param[in, out] C, matrix - */ -template <typename Dtype> -void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, - const Blob<Dtype> & B, Blob<Dtype> * C) { - CHECK_EQ(A.shape().size(), 2); - CHECK_EQ(B.shape().size(), 2); - CHECK_EQ(C.shape().size(), 2); - int a1, a2, b1, b2, m, n; - CHECK(!C->transpose()); - a1 = A.transpose() ? A.shape(1) : A.shape(0); - a2 = A.transpose() ? A.shape(0) : A.shape(1); - b1 = B.transpose() ? B.shape(1) : B.shape(0); - b2 = B.transpose() ? B.shape(0) : B.shape(1); - m = C->shape(0); - n = C->shape(1); - CHECK__EQ(a2, b1); - CHECK__EQ(a1, m); - CHECK__EQ(b2, n); - - int k = A.transpose() ? A.shape(0) : A.shape(1); - bool TranA = A.transpose(); - bool TranB = B.transpose(); - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, - TranA, TranB, C->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - // gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, - TranA, TranB, C->mutable_gpu_data()); - } -#endif // USE_GPU -} -/** - * Matrix multiplication, C = A(.T) * B(.T), transpose is considered. - * Strict shape checking: - * - all are matrix - * - shapes match for matrix multiplication - * - * @param[in] A input matrix - * @param[in] B input matrix - * @param[out] C output matrix - */ -template <typename Dtype> -void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) -{ - GEMM(xpu, Dtype(1), Dtype(0), A, B, C); -} - - -/*********************** Inner and Outer product****************************/ -/** - * Inner product for two vectors. - * Loose shape checking, A.count() == B.count. - * - * @param[in] A, input vector (shape checking using A.count()). - * @param[in] B, input vector (shape checking using B.count()). - * @return inner product value. - */ -template <typename Dtype> -Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { - Dtype res = 0; - CHECK_EQ(A.count(), B.count()); - int n = A.count(); - if (xpu == cpu) { - res = cpu_dot(A.cpu_data(), B.cpu_data(), n); - } -#ifdef USE_GPU - if (xpu == gpu) { - // gpu part - res = gpu_dot(A.gpu_data(), B.gpu_data(), n); - } -#endif // USE_GPU - return res; -} - -/** - * Outer product, C = A ** B, transpose is disabled. - * Loose shape checking, A.count() * B.count() == C.count() - * - * @param[in] A, input vector - * @param[in] B, input vector - * @param[out] C, output matrix - */ -template <typename Dtype> -void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, - Blob<Dtype> * C) { - CHECK(!C.transpose()); // do not support C.T now. - - int m = A.count(); - int n = B.count(); - CHECK_EQ(C->count(), m * n); - - if (xpu == cpu) { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, - false, false, C->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - // gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, - false, false, C->mutable_gpu_data()); - } -#endif // USE_GPU -} -/*********************** Element-wise functions ***********************/ -/** - * Apply the function from Op for each element in A and put the result into B, - * i.e., Bi = Op(Ai). - * Loose shape checking, A.count() == B.count(). - */ -template<typename Op, typename Dtype> -void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) { - cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data()); - } -#ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); - } -#endif // SINGA_GPU -} - -/** - * Apply the function from Op for each element in A and B, and put the result - * into C, i.e., Ci = Op(Ai, Bi). - * Loose shape checking, A, B and C are of the same size. - */ -template<typename Op, typename Dtype> -void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, - Blob<Dtype> * C) { - CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size"; - CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size"; - if (xpu == cpu) { - cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data()); - } -#ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data()); - } -#endif // SINGA_GPU -} - -/** - * Bi = Op(alpha, Ai) - * Loose shape checking, A.count() == B.count(). - */ -template<typename Op, typename Dtype> -void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>* B) { - CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) { - cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); - } -#ifdef SINGA_GPU -#endif // SINGA_GPU -} -/** - * Ci = Op(alpha, Ai, Bi) - * Loose shape checking, A, B and C are of the same size. - */ -template<typename Op, typename Dtype> -void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B, - Blob<Dtype>* C) { - CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) { - cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(), - C->mutable_cpu_data()); - } -#ifdef SINGA_GPU -#endif // SINGA_GPU -} - -/** - * Currently use std::copy which has shown better performance than memcpy. - * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy - * TODO(wangwei) test blas copy vs std::copy. - * - * Loose shape checking, A.count() == B.count(). - */ -template<typename Dtype> -void Copy(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>* B) { - CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; - if (xpu == cpu) - std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data()); - else { - LOG(FATAL) << "Not implemented"; - } -} - -/** - * C = A + B - * Implemented using Copy and AXPY. - */ -template<typename Dtype> -void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, - Blob<Dtype> * C) { - Copy(A, C); - AXPY(B, C, 1); -} - -/** - * C = A - B - * Implemented using Copy and AXPY. - */ -template<typename Dtype> -void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, - Blob<Dtype> * C) { - Copy(xpu, A, C); - AXPY(xpu, B, C, -1); -} - -/** - * C = A * B, implemented using - * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). - */ -template<typename Dtype> -void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, - Blob<Dtype> * C) { - Map<singa::op::Mult>(xpu, A, B, C); - // TODO(wangwei) use MKL's vector func -} - -/** - * C = A / B, implemented using - * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). - */ -template<typename Dtype> -void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, - Blob<Dtype> * C) { - Map<singa::op::Div>(xpu, A, B, C); - // TODO(wangwei) use MKL's vector func -} -/*************************1D<-->2D op/transform***************************/ -/** - * Add each row of B with A, i.e., Bij = alpha*Ai + beta*Bij - * Loose shape checking, B.count() % A.count() == 0. - * # rows of B = B.count() / A.count(). - * Transpose is disabled. - */ -template<typename Dtype> -void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { - CHECK_EQ(B.count() % A.count(), 0) << "#col of B not match length of A"; - int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, - false, false, B->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - singa_gpu_add_vec_row(B->gpu_data(), - A.gpu_data(), A.gpu_data(), m, n, n); - // gpu part - } -#endif // USE_GPU -} -/** - * Add each row of B with A, i.e., Bij = Ai + Bij - * Loose shape checking, B.count() % A.count() == 0. - * # rows of B = B.count() / A.count(). - * Transpose is disabled. - */ -template<typename Dtype> -void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { - MVAdd(xpu, Dtype(1), Dtype(1), A, B); -} - -/** - * Copy A to each row of B - * Loose shape checking, B.count() % A.count() == 0, - * # rows of B = B.count() / A.count(). - * Transpose is disabled. - */ -template<typename Dtype> -void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - MVAdd(xpu, Dtype(1), Dtype(0), A, B); -} - -/** - * Add each col of matrix A to vector B, i.e., Bi = \sum_j {alpha*Aij}+beta*Bi - * Loose shape checking, A.count() % B.count() == 0. - * # rows of A = A.count() / B.count(). - * Transpose is disabled. - */ -template<typename Dtype> -void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, - Blob<Dtype> * B) { - CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; - - int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - Blob<Dtype> one(n); - one.SetValue(1); - cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta, - false, false, B->mutable_cpu_data()); - } -#ifdef USE_GPU - if (xpu == gpu) { - singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); - // gpu part - } -#endif // USE_GPU -} -/** - * Reduce each row of A to an element of B. - * Loose shape checking, A.count() % B.count() == 0. - * # rows of A = A.count() / B.count(). - */ -template<typename Op, typename Dtype> -void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(A.count() % B.count(), 0) << "Row size not match B length"; - int m = B->count(), n = A.count() / m; - if (xpu == cpu) { - cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } -#ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } -#endif // SINGA_GPU -} -/** - * Duplicate each element of A into a row of B. - * Loose shape checking, B.count() % A.count() == 0. - * # rows of B = B.count() / A.count(). - */ -template<typename Op, typename Dtype> -void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - CHECK_EQ(B.count() % A.count(), 0) << "Row size of B not match length of A"; - int m = A.count(), n = B->count() / m; - if (xpu == cpu) { - cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } -#ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } -#endif // SINGA_GPU -} - -/***********************************************************/ -/** - * Apply the function from Op for each element in A, Op(Ai). - * @param A - */ -template<typename Op, typename Dtype> -void Map(XPU xpu, Blob<Dtype>* A) { - if (xpu == cpu) { - cpu_e_f<Op>(A->count(), A->mutable_cpu_data()); - } -#ifdef SINGA_GPU - if (xpu == gpu) { - // gpu part - gpu_e_f<Op>(A->count(), A->mutable_gpu_data()); - } -#endif // SINGA_GPU -} - -/** - * B = e ^ A - */ -template<typename Dtype> -void Exp(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Exp>(xpu, A, B); -} - -/** - * element-wise operation: b = 1.0f / (1.0f + expf(-a)); - */ -template<typename Dtype> -inline void Sigmoid(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Sigmoid>(xpu, A, B); -} - -/** - * element-wise operation: b = a * ( 1.0f - a ); - */ -inline void SigmoidGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::SigmoidGrad>(xpu, A, B); -} - -/** - * element-wise operation: b = std::max( a, 0) - */ -inline void Relu(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Relu>(xpu, A, B); -} - -/** - * element-wise operation: b = a > 0 ? 1: 0; - */ -template<typename Dtype> -inline void ReluGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::ReluGrad>(xpu, A, B); -} - -/** - * element-wise operation: b = tanh(a); - */ -template<typename Dtype> -inline void Tanh(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Tanh>(xpu, A, B); -} - -/** - * B = 1- A^2 - */ -template<typename Dtype> -inline void TanhGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::TanhGrad>(xpu, A, B); -} -/** - * B = log(1+exp(A)) - */ -template<typename Dtype> -inline void Softplus(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Softplus>(xpu, A, B); -} - -/** - * B = 1.0f / (1.0f + expf(-A)); - */ -template<typename Dtype> -inline void SoftplusGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::SoftplusGrad>(xpu, A, B); -} - -template<typename Dtype> -inline void Square(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Square>(xpu, A, B); -} - -template<typename Dtype> -inline void SquareGrad(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Square_grad>(xpu, A, B); -} - -template<typename Dtype> -inline void Sqrt(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { - Map<singa::op::Sqrt>(xpu, A, B); -} - -/** - * B = A < alpha ? 1:0; - */ -template<typename Dtype> -inline void Threshold(XPU xpu, Dtype alpha, const Blob<Dtype> & A, - Blob<Dtype> * B) { - Map<singa::op::Threshold>(xpu, alpha, A, B); -} -} // end of namespace singa - -#endif // SINGA_BLOB_MATH_BLOB_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/math_kernel.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h deleted file mode 100644 index f5d3e34..0000000 --- a/include/singa/blob/math_kernel.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef MATH_KERNEL_H -#define MATH_KERNEL_H - -namespace singa{ - -extern "C" { - void singa_gpu_sum_vec(float *data, float *sum , long n); - - void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); - - void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride); - - void singa_gpu_set_value(float *data, float value, long n); - - void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_scale_grad(float *data, float alpha, long n); - - void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n); - - void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); - - void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); - - void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); - - void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); - -}; - -} - -#endif http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/blob/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h deleted file mode 100644 index 1131c5d..0000000 --- a/include/singa/blob/singa_op.h +++ /dev/null @@ -1,350 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -#ifndef SINGA_BLOB_SINGA_OP_H_ -#define SINGA_BLOB_SINGA_OP_H_ - -#ifdef SINGA_GPU -#include <cuda_runtime.h> -#endif // SINGA_GPU -#include <cmath> -#include <algorithm> -#ifdef SINGA_GPU -#include "cublas_v2.h" -#include "singa/blob/math_kernel.h" -#endif // SINGA_GPU - -namespace singa { - -namespace op { - -/** - * b = e^a - */ -template<Dtype> -struct Exp { - inline static void Map(const float & a, float * b) { - *b = exp(a); - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, - float * b, int n) { - singa::singa_gpu_exp(a, b, alpha, n); - } -#endif // SINGA_GPU -}; -/** - * b = log(a), base is e - */ -template<Dtype> -struct Log { - inline static void Map(const float & a, float *b) { - *b = log(a); - } -} - -template<Dtype> -struct Sigmoid { - inline static void Map(const float & a, float * b) { - *b = 1.0f / (1.0f + expf(-a * alpha)); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - float * b, int n) { - singa::singa_gpu_sigmoid(a, b, 1, n); - } -#endif // SINGA_GPU -}; -template<Dtype> -struct SigmoidGrad { - inline static void Map(const float & a, float * b) { - *b = a * (1.0f - a); - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sigmoid_grad(a, b, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Relu { - inline static void Map(const float & a, float * b) { - *b = std::max(a, 0.0f); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, float * b, int n) { - singa::singa_gpu_relu(a, b, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct ReluGrad { - inline static void Map(const float & a, float * b) { - *b = a > 0 ? 1 : 0; - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, float * b, int n) { - singa::singa_gpu_relu_grad(a, b, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Tanh { - inline static void Map(const float & a, float * b) { - *b = tanhf(a); - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh(a, b, alpha, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct TanhGrad { - inline static void Map(const float & a, float * b) { - *b = 1 - a * a; - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh_grad(a, b, alpha, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Softplus { - inline static void Map(const float & a, float * b) { - *b = logf(1 + expf(a)); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, float * b, int n) { - singa::singa_gpu_softplus(a, b, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct SoftplusGrad { - inline static void Map(const float & a, float * b) { - *b = 1.0f / (1.0f + expf(-a)); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - float * b, int n) { - singa::singa_gpu_softplus_grad(a, b, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Square { - inline static void Map(const float & a, float * b) { - *b = a * a; - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - float * b, int n) { - singa::singa_gpu_square(a, b, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct SquareGrad { - inline static void Map(const float & a, float * b) { - *b = 2 * sqrt(a); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - float * b, int n) { - singa::singa_gpu_square_grad(a, b, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Sqrt { - inline static void Map(const float & a, float * b) { - *b = sqrt(a); - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - float * b, int n) { - singa::singa_gpu_sqrt(a, b, n); - } -#endif // SINGA_GPU -}; - -/*********************************************************************/ -/** - * c = pow(a, b), i.e., c = a^b - */ -template<Dtype> -struct Pow { - inline static void Map(const float & a, const float &b, float * c) { - *c = pow(a, b); - } -} -template<Dtype> -struct Mult { - inline static void Map(const float & a, const float & b, float * c) { - *c = a * b; - } -#ifdef SINGA_GPU - inline static void CudaMap(const float* a, const float* b, float* c, int n) { - singa::singa_gpu_mult(a, b, c, 1, 1, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Div { - inline static void Map(const float & a, const float & b, float * c) { - *c = a / b; - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, - const float * b, float * c, int n) { - singa::singa_gpu_div(a, b, c, 1, 1, n); - } -#endif // SINGA_GPU -}; - - -/*********************************************************************/ -template<Dtype> -struct Set { - inline static void Map(float alpha, float * a) { - *a = alpha; - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, float * a, int n) { - singa::singa_gpu_set_value(a, alpha, n); - } -#endif // SINGA_GPU -}; - -template<Dtype> -struct Threshold { - inline static void Map(float alpha, const float & a, float * b) { - *b = a < alpha ? 1.0f : 0.0f; - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, - float * b, int n) { - singa::singa_gpu_threshold(a, b, alpha, n); - } -#endif // SINGA_GPU -}; - -/**********************************/ -struct Expand_Div { - inline static void Map(const float & a, int n, float * b) { - for (int i = 0 ; i < n ; i++) { - b[i] /= a; - } - } -#ifdef SINGA_GPU - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_scale(b, b, a, n); - } -#endif // SINGA_GPU -}; - -struct Repmat { - inline static void Map(const float & a, int n, float * b) { - for (int i = 0 ; i < n ; i++) { - b[i] = a; - } - } -#ifdef SINGA_GPU - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_set_value(b, a, n); - } -#endif // SINGA_GPU -}; - - -struct Scale { - inline static void Map(float alpha, const float & a, float * b) { - *b = alpha * a; - } - #ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, - float * b, int n) { - singa::singa_gpu_scale(a, b, alpha, n); - } - #endif // SINGA_GPU -}; - -struct Scale_grad { - inline static void Map(float alpha, float * output) { - *output = alpha; - } - #ifdef SINGA_GPU - inline static void CudaMap(float alpha, float * output, int n) { - singa::singa_gpu_scale_grad(output, alpha, n); - } - #endif // SINGA_GPU -}; - -struct ExpGrad { - inline static void Map(float alpha, const float & a, float * b) { - // log is the natrual log based on e - *b = a * log(alpha); - } -#ifdef SINGA_GPU - inline static void CudaMap(float alpha, const float * a, - float * b, int n) { - singa::singa_gpu_exp_grad(a, b, alpha, n); - } -#endif // SINGA_GPU -}; - -struct Sum { - inline static void Map(const float * a, int n, float * b) { - *b = 0; - for (int i = 0 ; i < n ; i++) { - *b += a[i]; - } - } -#ifdef SINGA_GPU - inline static void CudaMap(const float * a, int n, float * b) { - float *sum = NULL; - cudaMalloc(<void**>(&sum), n*sizeof(float)); - - singa::singa_gpu_sum_vec(a, sum, n); - - cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice); - cudaFree(sum); - } -#endif // SINGA_GPU -}; - -}; // namespace op - -}; // namespace singa - -#endif // SINGA_BLOB_SINGA_OP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/blob.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h index eecb674..f7c29af 100644 --- a/include/singa/utils/blob.h +++ b/include/singa/utils/blob.h @@ -121,10 +121,12 @@ class Blob { public: Blob() {} explicit Blob(const std::vector<int>& shape) { Reshape(shape); } - explicit Blob(int count) { Reshape(count); } - explicit Blob(int a, int b) { Reshape(a, b); } - explicit Blob(int a, int b, int c) { Reshape(a, b, c); } - explicit Blob(int a, int b, int c, int d) { Reshape(a, b, c, d); } + explicit Blob(int dim0) { Reshape(dim0); } + explicit Blob(int dim0, int dim1) { Reshape(dim0, dim1); } + explicit Blob(int dim0, int dim1, int dim2) { Reshape(dim0, dim1, dim2); } + explicit Blob(int dim0, int dim1, int dim2, int dim3) { + Reshape(dim0, dim1, dim2, dim3); + } /** * Change the shape of the blob, re-allocat memory if Blob size() changes. * @@ -136,34 +138,43 @@ class Blob { * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 1. * * @see Reshape(const std::vector<int>&). - * @param[in] count total num of elements. + * @param[in] dim0 total num of elements. */ - void Reshape(int count); + void Reshape(int dim0) { + Reshape(std::vector<int>{dim0}); + } /** * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 2. * - * @param a the highest dimension size, i.e., a = shape[0]. E.g., a could the - * batchsize. - * @param[in] b, b = shape[1], e.g., b could be the length of the feature vector. + * @param dim0 the highest dimension size, i.e., dim0 = shape[0]. E.g., dim0 + * could the batchsize. + * @param[in] dim1, dim1 = shape[1], e.g., dim1 could be the length of the + * feature vector. */ - void Reshape(int a, int b); + void Reshape(int dim0, int dim1) { + Reshape(std::vector<int>{dim0, dim1}); + } /** * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 3. * - * @param[in] a, a = shape[0] - * @param[in] b, b = shape[1] - * @param[in] c, c = shape[2] + * @param[in] dim0, dim0 = shape[0] + * @param[in] dim1, dim1 = shape[1] + * @param[in] dim2, dim2 = shape[2] */ - void Reshape(int a, int b, int c); + void Reshape(int dim0, int dim1, int dim2) { + Reshape(std::vector<int>{dim0, dim1, dim2}); + } /** * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 4. * - * @param[in] a, a = shape[0] - * @param[in] b, b = shape[1] - * @param[in] c, c = shape[2] - * @param[in] d, d = shape[3] + * @param[in] dim0, dim0 = shape[0] + * @param[in] dim1, dim1 = shape[1] + * @param[in] dim2, dim2 = shape[2] + * @param[in] dim3, dim3 = shape[3] */ - void Reshape(int a, int b, int c, int d); + void Reshape(int dim0, int dim1, int dim2, int dim3) { + Reshape(std::vector<int>{dim0, dim1, dim2, dim3}); + } /** * Reshape as the shape of *other* Blob. * @param[in] other @@ -274,7 +285,7 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) { template <typename Dtype> Blob<Dtype>* Reshape(const Blob<Dtype> & A, int count) { std::vector<int> tmpshape; - tmpshape.push_back(dim1); + tmpshape.push_back(count); return Reshape(A, tmpshape); } /** @@ -302,8 +313,8 @@ Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2) { * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*). */ template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, - int dim2, int dim3) { +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2, + int dim3) { std::vector<int> tmpshape; tmpshape.push_back(dim0); tmpshape.push_back(dim1); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h new file mode 100644 index 0000000..59f89ea --- /dev/null +++ b/include/singa/utils/math_addr.h @@ -0,0 +1,206 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_UTILS_MATH_ADDR_H_ +#define SINGA_UTILS_MATH_ADDR_H_ +extern "C" { + #include <cblas.h> +} +#ifdef USE_GPU +#include <cuda_runtime.h> +#endif +#include "singa/utils/singa_op.h" +#ifdef USE_GPU +#include "cublas_v2.h" +#endif + + +namespace singa { + +template<typename Dtype> +void cpu_gemm(const Dtype * A, const Dtype * B, + const int m, const int n, const int k, const Dtype alpha, const Dtype beta, + const bool TranA, const bool TranB, Dtype * C) { + int lda, ldb; + CBLAS_TRANSPOSE tA, tB; + lda = TranA ? m : k; + ldb = TranB ? k : n; + tA = TranA ? CblasTrans : CblasNoTrans; + tB = TranB ? CblasTrans : CblasNoTrans; + cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, + B, ldb, beta, C, n); +} + +// should be very careful: +// m is the length of B, and n is the length of C , A is a n*m matrix +template<typename Dtype> +void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, + const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) { + int lda, ldb; + CBLAS_TRANSPOSE tA, tB; + lda = TranA ? m : k; + ldb = TranB ? k : n; + tA = TranA ? CblasTrans : CblasNoTrans; + tB = TranB ? CblasTrans : CblasNoTrans; + cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, + B, ldb, beta, C, n); + +} + +template<typename Dtype> +void cpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { + cblas_saxpy(n, alpha, A, 1, B, 1); +} + +template<typename Dtype> +Dtype cpu_dot(const Dtype * A, const Dtype * B, const int n) { + Dtype sum = 0; + for (int i = 0 ; i < n ; i++) + sum += A[i] * B[i]; + return sum; +} + +// element-wise +template<typename Op, typename Dtype> +void cpu_e_f(const int n, const Dtype alpha, Dtype * A) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, &A[i]); + } +} + +template<typename Op, typename Dtype> +void cpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, A[i], &B[i]); + } +} + +template<typename Op, typename Dtype> +void cpu_e_f(const int n, const Dtype * A, const Dtype * B, + const Dtype alpha, const Dtype beta, Dtype * C) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, beta, A[i], B[i], &C[i]); + } +} +// element-wise generalized operation defined in Op + + +// matrix/vector expand/reduce + +template<typename Op, typename Dtype> +void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) { + for (int i = 0 ; i < m ; i++) { + Op::Map(A+i*n, n, B[i]); + } +} +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op, typename Dtype> +void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) { + for (int i = 0 ; i < m ; i++) { + Op::Map(A[i], n, B+i*n); + } +} +// expand each element in A into a row of B + +#ifdef USE_GPU +template<typename Dtype> +void gpu_gemm(const Dtype * A, const Dtype * B, const int m, const int n, + const int k, const Dtype alpha, const Dtype beta, const bool TranA, + const bool TranB, Dtype * C) { + int lda = TranA ? m : k; + int ldb = TranB ? k : n; + int ldc = n; + cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, + A, lda, &beta, C, ldc); + cublasDestroy(handle); +} + +template<typename Dtype> +void gpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n, + const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) { + int lda = n; + cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1); + cublasDestroy(handle); +} + +template<typename Dtype> +void gpu_axpy(const Dtype * A, const int n, const Dtype alpha, Dtype * B) { + cublasHandle_t handle; + cublasCreate(&handle); + cublasSaxpy(handle, n, &alpha, A, 1, B, 1); + cublasDestroy(handle); +} + +template<typename Dtype> +Dtype gpu_dot(const Dtype * A, const Dtype * B, const int n) { + cublasHandle_t handle; + cublasCreate(&handle); + Dtype result = 0.0; + cublasSdot(handle, n, A, 1, B, 1, &result); + cublasDestroy(handle); + return result; +} + +// element-wise +template<typename Op, typename Dtype> +void gpu_e_f(const int n, const Dtype alpha, Dtype * A) { + Op::CudaMap(alpha, A, n); +} + +template<typename Op, typename Dtype> +void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) { + Op::CudaMap(alpha, A, B, n); +} + +template<typename Op, typename Dtype> +void gpu_e_f(const int n, const Dtype * A, const Dtype * B, + const Dtype alpha, const Dtype beta, Dtype * C) { + Op::CudaMap(alpha, beta, A, B, C, n); +} +// element-wise generalized operation defined in Op + +// matrix/vector expand/reduce + +template<typename Op, typename Dtype> +void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) { + for (int i = 0 ; i < m ; i++) { + Op::CudaMap(A+i*n, n, B[i]); + } +} +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op, typename Dtype> +void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) { + for (int i = 0 ; i < m ; i++) { + Op::CudaMap(A[i], n, B+i*n); + } +} +// expand each element in A into a row of B +#endif // USE_GPU + +} // namespace singa +#endif // SINGA_UTILS_MATH_ADDR_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h new file mode 100644 index 0000000..2d53d32 --- /dev/null +++ b/include/singa/utils/math_blob.h @@ -0,0 +1,487 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_UTILS_MATH_BLOB_H_ +#define SINGA_UTILS_MATH_BLOB_H_ + +#include <vector> +#include "singa/utils/blob.h" +#include "singa/utils/singa_op.h" +#include "singa/utils/math_addr.h" + + +namespace singa { +enum XPU {cpu, gpu, any}; + +/************* BLAS level 1 *****************/ +/** + * Scale each element of A with alpha, and put the result into B. + * Bi = alpha*Ai + * Use blas scale internally. + */ +template<typename Dtype> +void Scale(xpu xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { + CHECK_EQ(A.count(), B->count()); + if (xpu == cpu) + cpu_scale(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); +#ifdef USE_GPU +#endif +} + +/** + * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size + */ +template<typename Dtype> +void AXPY(XPU xpu, Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) { + CHECK_EQ(A.count(), B.count()); + if (xpu == cpu) { + cpu_axpy(A.cpu_data(), A.count(), + alpha, B->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + gpu_axpy(A.gpu_data(), A.count(), + alpha, B->mutable_gpu_data()); + } +#endif // USE_GPU +} + +/************* BLAS level 2 *****************/ +/** + * Matrix vector multiplication, C = alpha A(.T) * B + beta C. + * Strict shape checking: + * - dim of A ==2 + * columsn of A(.T) == B.count() + * - rows of A(.T) == C.count() + * + * @param[in] alpha + * @param[in] beta + * @param[in] A, matrix + * @param[in] B, vector + * @param[in, out] C, vector + */ +template<typename Dtype> +void GEMV(XPU, xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, + const Blob<Dtype>& B, Blob<Dtype>* C) { + CHECK_EQ(A.shape().size(), 2) << "A must be a matrix"; + int a1, a2, m, n; + a1 = A.transpose() ? A.shape(1) : A.shape(0); + a2 = A.transpose() ? A.shape(0) : A.shape(1); + m = B.count(); + n = C->count(); + CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B"; + CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C"; + + bool TranA = A.transpose(); + if (xpu == cpu) { + cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA, + C->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + // gpu part + gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, alpha, beta, TranA, + C->mutable_gpu_data()); + } +#endif // USE_GPU +} +/** + * Matrix vector multiplication, C = A(.T) * B, transpose is considered. + * Loose shape checking: + * - dim of A >=2 + * - A.count() % B.count() == 0 + * - B.count() == C.count() + * + * @param[in] A input matrix + * @param[in] B input vector + * @param[out] C output vector + */ +template <typename Dtype> +void MVDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) +{ + GEMV(xpu, Dtype(1), Dtype(0), A, B, C); +} + +/************* BLAS level 3 *****************/ +/** + * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix. + * + * Tranpose is considered for A and B. + * Strict shape checking: + * - all are matrix + * - shapes match for matrix multiplication + * + * @param[in] alpha + * @param[in] beta + * @param[in] A, matrix + * @param[in] B, matrix + * @param[in, out] C, matrix + */ +template <typename Dtype> +void GEMM(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype>& A, + const Blob<Dtype> & B, Blob<Dtype> * C) { + CHECK_EQ(A.shape().size(), 2); + CHECK_EQ(B.shape().size(), 2); + CHECK_EQ(C.shape().size(), 2); + int a1, a2, b1, b2, m, n; + CHECK(!C->transpose()); + a1 = A.transpose() ? A.shape(1) : A.shape(0); + a2 = A.transpose() ? A.shape(0) : A.shape(1); + b1 = B.transpose() ? B.shape(1) : B.shape(0); + b2 = B.transpose() ? B.shape(0) : B.shape(1); + m = C->shape(0); + n = C->shape(1); + CHECK__EQ(a2, b1); + CHECK__EQ(a1, m); + CHECK__EQ(b2, n); + + int k = A.transpose() ? A.shape(0) : A.shape(1); + bool TranA = A.transpose(); + bool TranB = B.transpose(); + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_gpu_data()); + } +#endif // USE_GPU +} +/** + * Matrix multiplication, C = A(.T) * B(.T), transpose is considered. + * Strict shape checking: + * - all are matrix + * - shapes match for matrix multiplication + * + * @param[in] A input matrix + * @param[in] B input matrix + * @param[out] C output matrix + */ +template <typename Dtype> +void MMDot(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype>* C) +{ + GEMM(xpu, Dtype(1), Dtype(0), A, B, C); +} + + +/*********************** Inner and Outer product****************************/ +/** + * Inner product for two vectors. + * Loose shape checking, A.count() == B.count. + * + * @param[in] A, input vector (shape checking using A.count()). + * @param[in] B, input vector (shape checking using B.count()). + * @return inner product value. + */ +template <typename Dtype> +Dtype VVDot(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B) { + Dtype res = 0; + CHECK_EQ(A.count(), B.count()); + int n = A.count(); + if (xpu == cpu) { + res = cpu_dot(A.cpu_data(), B.cpu_data(), n); + } +#ifdef USE_GPU + if (xpu == gpu) { + // gpu part + res = gpu_dot(A.gpu_data(), B.gpu_data(), n); + } +#endif // USE_GPU + return res; +} + +/** + * Outer product, C = A ** B, transpose is disabled. + * Loose shape checking, A.count() * B.count() == C.count() + * + * @param[in] A, input vector + * @param[in] B, input vector + * @param[out] C, output matrix + */ +template <typename Dtype> +void OuterProduct(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>& B, + Blob<Dtype> * C) { + CHECK(!C.transpose()); // do not support C.T now. + + int m = A.count(); + int n = B.count(); + CHECK_EQ(C->count(), m * n); + + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_gpu_data()); + } +#endif // USE_GPU +} +/*********************** Element-wise functions ***********************/ +/** + * Apply the function from Op for each element in A and put the result into B, + * i.e., Bi = Op(Ai). + * Loose shape checking, A.count() == B.count(). + */ +template<typename Op, typename Dtype> +void Map(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { + CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; + if (xpu == cpu) { + cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data()); + } +#ifdef SINGA_GPU + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); + } +#endif // SINGA_GPU +} + +/** + * Apply the function from Op for each element in A and B, and put the result + * into C, i.e., Ci = Op(Ai, Bi). + * Loose shape checking, A, B and C are of the same size. + */ +template<typename Op, typename Dtype> +void Map(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, + Blob<Dtype> * C) { + CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size"; + CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size"; + if (xpu == cpu) { + cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data()); + } +#ifdef SINGA_GPU + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data()); + } +#endif // SINGA_GPU +} + +/** + * Bi = Op(alpha, Ai) + * Loose shape checking, A.count() == B.count(). + */ +template<typename Op, typename Dtype> +void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>* B) { + CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; + if (xpu == cpu) { + cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data()); + } +#ifdef SINGA_GPU +#endif // SINGA_GPU +} +/** + * Ci = Op(alpha, Ai, Bi) + * Loose shape checking, A, B and C are of the same size. + */ +template<typename Op, typename Dtype> +void Map(XPU xpu, Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B, + Blob<Dtype>* C) { + CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; + if (xpu == cpu) { + cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(), + C->mutable_cpu_data()); + } +#ifdef SINGA_GPU +#endif // SINGA_GPU +} + +/** + * Currently use std::copy which has shown better performance than memcpy. + * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy + * TODO(wangwei) test blas copy vs std::copy. + * + * Loose shape checking, A.count() == B.count(). + */ +template<typename Dtype> +void Copy(XPU xpu, const Blob<Dtype>& A, const Blob<Dtype>* B) { + CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size"; + if (xpu == cpu) + std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data()); + else { + LOG(FATAL) << "Not implemented"; + } +} + +/** + * C = A + B + * Implemented using Copy and AXPY. + */ +template<typename Dtype> +void Add(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, + Blob<Dtype> * C) { + Copy(A, C); + AXPY(B, C, 1); +} + +/** + * C = A - B + * Implemented using Copy and AXPY. + */ +template<typename Dtype> +void Sub(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, + Blob<Dtype> * C) { + Copy(xpu, A, C); + AXPY(xpu, B, C, -1); +} + +/** + * C = A * B, implemented using + * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). + */ +template<typename Dtype> +void Mult(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, + Blob<Dtype> * C) { + Map<singa::op::Mult>(xpu, A, B, C); + // TODO(wangwei) use MKL's vector func +} + +/** + * C = A / B, implemented using + * Map(XPU, const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*). + */ +template<typename Dtype> +void Div(XPU xpu, const Blob<Dtype> & A, const Blob<Dtype> & B, + Blob<Dtype> * C) { + Map<singa::op::Div>(xpu, A, B, C); + // TODO(wangwei) use MKL's vector func +} +/*************************1D<-->2D op/transform***************************/ +/** + * Add each row of B with A, i.e., Bij = alpha*Ai + beta*Bij + * Loose shape checking, B.count() % A.count() == 0. + * # rows of B = B.count() / A.count(). + * Transpose is disabled. + */ +template<typename Dtype> +void MVAdd(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, + Blob<Dtype> * B) { + CHECK_EQ(B.count() % A.count(), 0) << "#col of B not match length of A"; + int m = A.count(), n = B->count() / m; + if (xpu == cpu) { + Blob<Dtype> one(n); + one.SetValue(1); + cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, + false, false, B->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + singa_gpu_add_vec_row(B->gpu_data(), + A.gpu_data(), A.gpu_data(), m, n, n); + // gpu part + } +#endif // USE_GPU +} +/** + * Add each row of B with A, i.e., Bij = Ai + Bij + * Loose shape checking, B.count() % A.count() == 0. + * # rows of B = B.count() / A.count(). + * Transpose is disabled. + */ +template<typename Dtype> +void MVAdd(XPU xpu, const Blob<Dtype> & A, Blob<Dtype>* B) { + MVAdd(xpu, Dtype(1), Dtype(1), A, B); +} + +/** + * Copy A to each row of B + * Loose shape checking, B.count() % A.count() == 0, + * # rows of B = B.count() / A.count(). + * Transpose is disabled. + */ +template<typename Dtype> +void Repmat(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { + MVAdd(xpu, Dtype(1), Dtype(0), A, B); +} + +/** + * Add each col of matrix A to vector B, i.e., Bi = \sum_j {alpha*Aij}+beta*Bi + * Loose shape checking, A.count() % B.count() == 0. + * # rows of A = A.count() / B.count(). + * Transpose is disabled. + */ +template<typename Dtype> +void MVSum(XPU xpu, Dtype alpha, Dtype beta, const Blob<Dtype> & A, + Blob<Dtype> * B) { + CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A"; + + int m = B->count(), n = A.count() / m; + if (xpu == cpu) { + Blob<Dtype> one(n); + one.SetValue(1); + cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta, + false, false, B->mutable_cpu_data()); + } +#ifdef USE_GPU + if (xpu == gpu) { + singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); + // gpu part + } +#endif // USE_GPU +} +/** + * Reduce each row of A to an element of B. + * Loose shape checking, A.count() % B.count() == 0. + * # rows of A = A.count() / B.count(). + */ +template<typename Op, typename Dtype> +void Reduce2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { + CHECK_EQ(A.count() % B.count(), 0) << "Row size not match B length"; + int m = B->count(), n = A.count() / m; + if (xpu == cpu) { + cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } +#ifdef SINGA_GPU + if (xpu == gpu) { + // gpu part + gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } +#endif // SINGA_GPU +} +/** + * Duplicate each element of A into a row of B. + * Loose shape checking, B.count() % A.count() == 0. + * # rows of B = B.count() / A.count(). + */ +template<typename Op, typename Dtype> +void Expand2D(XPU xpu, const Blob<Dtype> & A, Blob<Dtype> * B) { + CHECK_EQ(B.count() % A.count(), 0) << "Row size of B not match length of A"; + int m = A.count(), n = B->count() / m; + if (xpu == cpu) { + cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } +#ifdef SINGA_GPU + if (xpu == gpu) { + // gpu part + gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } +#endif // SINGA_GPU +} + +} // end of namespace singa + +#endif // SINGA_BLOB_MATH_BLOB_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/math_kernel.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h new file mode 100644 index 0000000..f5d3e34 --- /dev/null +++ b/include/singa/utils/math_kernel.h @@ -0,0 +1,59 @@ +#ifndef MATH_KERNEL_H +#define MATH_KERNEL_H + +namespace singa{ + +extern "C" { + void singa_gpu_sum_vec(float *data, float *sum , long n); + + void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); + + void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride); + + void singa_gpu_set_value(float *data, float value, long n); + + void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_scale_grad(float *data, float alpha, long n); + + void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n); + + void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + + void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + +}; + +} + +#endif http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/include/singa/utils/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h new file mode 100644 index 0000000..ff5aba4 --- /dev/null +++ b/include/singa/utils/singa_op.h @@ -0,0 +1,265 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_UTILS_SINGA_OP_H_ +#define SINGA_UTILS_SINGA_OP_H_ + +#include <cmath> +#include <algorithm> + +#ifdef USE_GPU +#include <cuda_runtime.h> +#include "cublas_v2.h" +#include "singa/utils/math_kernel.h" +#endif // USE_GPU + +namespace singa { + +namespace op { + +/** + * b = e^a + */ +template<Dtype> +struct Exp { + inline static void Map(const float & a, float * b) { + *b = exp(a); + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_exp(a, b, alpha, n); + } +#endif // USE_GPU +}; +/** + * b = log(a), base is e + */ +template<Dtype> +struct Log { + inline static void Map(const float & a, float *b) { + *b = log(a); + } +} + +template<Dtype> +struct Sigmoid { + inline static void Map(const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a * alpha)); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + float * b, int n) { + singa::singa_gpu_sigmoid(a, b, 1, n); + } +#endif // USE_GPU +}; +template<Dtype> +struct SigmoidGrad { + inline static void Map(const float & a, float * b) { + *b = a * (1.0f - a); + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_sigmoid_grad(a, b, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Relu { + inline static void Map(const float & a, float * b) { + *b = std::max(a, 0.0f); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, float * b, int n) { + singa::singa_gpu_relu(a, b, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct ReluGrad { + inline static void Map(const float & a, float * b) { + *b = a > 0 ? 1 : 0; + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, float * b, int n) { + singa::singa_gpu_relu_grad(a, b, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Tanh { + inline static void Map(const float & a, float * b) { + *b = tanhf(a); + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_tanh(a, b, alpha, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct TanhGrad { + inline static void Map(const float & a, float * b) { + *b = 1 - a * a; + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, const float * a, float * b, int n) { + singa::singa_gpu_tanh_grad(a, b, alpha, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Softplus { + inline static void Map(const float & a, float * b) { + *b = logf(1 + expf(a)); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, float * b, int n) { + singa::singa_gpu_softplus(a, b, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct SoftplusGrad { + inline static void Map(const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a)); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + float * b, int n) { + singa::singa_gpu_softplus_grad(a, b, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Square { + inline static void Map(const float & a, float * b) { + *b = a * a; + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + float * b, int n) { + singa::singa_gpu_square(a, b, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct SquareGrad { + inline static void Map(const float & a, float * b) { + *b = 2 * sqrt(a); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + float * b, int n) { + singa::singa_gpu_square_grad(a, b, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Sqrt { + inline static void Map(const float & a, float * b) { + *b = sqrt(a); + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + float * b, int n) { + singa::singa_gpu_sqrt(a, b, n); + } +#endif // USE_GPU +}; + +/*********************************************************************/ +/** + * c = pow(a, b), i.e., c = a^b + */ +template<Dtype> +struct Pow { + inline static void Map(const float & a, const float &b, float * c) { + *c = pow(a, b); + } +} +template<Dtype> +struct Mult { + inline static void Map(const float & a, const float & b, float * c) { + *c = a * b; + } +#ifdef USE_GPU + inline static void CudaMap(const float* a, const float* b, float* c, int n) { + singa::singa_gpu_mult(a, b, c, 1, 1, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Div { + inline static void Map(const float & a, const float & b, float * c) { + *c = a / b; + } +#ifdef USE_GPU + inline static void CudaMap(const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_div(a, b, c, 1, 1, n); + } +#endif // USE_GPU +}; + + +/*********************************************************************/ +template<Dtype> +struct Set { + inline static void Map(float alpha, float * a) { + *a = alpha; + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, float * a, int n) { + singa::singa_gpu_set_value(a, alpha, n); + } +#endif // USE_GPU +}; + +template<Dtype> +struct Threshold { + inline static void Map(float alpha, const float & a, float * b) { + *b = a < alpha ? 1.0f : 0.0f; + } +#ifdef USE_GPU + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_threshold(a, b, alpha, n); + } +#endif // USE_GPU +}; + +}; // namespace op + +}; // namespace singa + +#endif // SINGA_UTILS_SINGA_OP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/blob/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu deleted file mode 100644 index 4c828d5..0000000 --- a/src/blob/math_kernel.cu +++ /dev/null @@ -1,439 +0,0 @@ -#include <cmath> -#include "singa/blob/math_kernel.h" - -#define CU2DBLOCK_X 32 -#define CU2DBLOCK_Y 32 - -#define CU1DBLOCK 1024 -#define CU1DBLOCKF 1024.0 - - -//Cuda Kernel Functions - -__global__ -void kernel_sum_vec(float *data, float *sum , long n) -{ - int THREADS = blockDim.x; - - __shared__ float aux[CU1DBLOCK]; - int steps = (n - 1) / THREADS + 1; - aux[threadIdx.x] = data[threadIdx.x]; - - for(int i=1; i<steps; ++i) { - if(threadIdx.x+i*THREADS < n) { - aux[threadIdx.x] += data[threadIdx.x+i*THREADS]; - } - } - - int total_threads = THREADS; - __syncthreads(); - - while(total_threads > 1) { - int half_point = ((1+total_threads) >> 1); - if (threadIdx.x < half_point) { - if(threadIdx.x+half_point < total_threads) { - aux[threadIdx.x] += aux[threadIdx.x + half_point]; - } - } - __syncthreads(); - total_threads = ((total_threads+1) >> 1); - } - - __syncthreads(); - *sum = aux[0]; -} - -__global__ -void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) -{ - int j = blockIdx.x; - int THREADS = blockDim.x; - if(j >= cols) { - return; - } - - __shared__ float aux[CU1DBLOCK]; - int steps = (rows - 1) / THREADS + 1; - aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride]; - for(int i=1; i<steps; ++i) { - if(threadIdx.x+i*THREADS < rows) { - aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride]; - } - } - - int total_threads = THREADS; - __syncthreads(); - while(total_threads > 1) { - int half_point = ((1+total_threads) >> 1); - if (threadIdx.x < half_point) { - if(threadIdx.x+half_point < total_threads) { - aux[threadIdx.x] += aux[threadIdx.x + half_point]; - } - } - __syncthreads(); - total_threads = ((total_threads+1) >> 1); - } - - __syncthreads(); - dst_vec_data[j] = aux[0]; -} - -__global__ -void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride) -{ - long i = blockIdx.x * blockDim.x + threadIdx.x; - long j = blockIdx.y * blockDim.y + threadIdx.y; - long num_threads_x = blockDim.x * gridDim.x; - long num_threads_y = blockDim.y * gridDim.y; - long index = 0; - for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) { - index = j * stride + i; - des_mat_data[index] = src_mat_data[index] + src_vec_data[i]; - } -} - -__global__ static -void kernel_set_value(float *data, float value, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - data[index] = value; - } -} - -__global__ -void kernel_scale(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * alpha; - } -} - -__global__ -void kernel_scale_grad(float *data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - data[index] = alpha; - } -} - -__global__ -void kernel_exp(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = pow(-src_data[index],alpha); - } -} - -__global__ -void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * log(alpha); - } -} - -__global__ -void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * alpha); - } -} - -__global__ -void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha; - } -} - -__global__ -void kernel_relu(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index]; - } -} - -__global__ -void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha; - } -} - - -__global__ -void kernel_tanh(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = tanhf( src_data[index] * alpha ); - } -} - -__global__ -void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = alpha * (1.0f - src_data[index] * src_data[index] ); - } -} - -__global__ -void kernel_softplus(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = logf(1 + expf(src_data[index])); - } -} - -__global__ -void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / (1.0f + expf(-src_data[index])); - } -} - -__global__ -void kernel_square(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * src_data[index]; - } -} - -__global__ -void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 2 * sqrt(src_data[index]); - } -} - -__global__ -void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = sqrt(src_data[index]); - } -} - -__global__ -void kernel_threshold(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f; - } -} - -__global__ -void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] + src_data_b[index]; - } -} - -__global__ -void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] - src_data_b[index]; - } -} - -__global__ -void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] * src_data_b[index]; - } -} - -__global__ -void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] / src_data_b[index]; - } -} - -// -namespace singa{ - -void singa_gpu_sum_vec(float *data, float *sum , long n) -{ - long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n; - // here, we only need one block - long num_blocks = 1; - - kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n); -} - -void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) -{ - long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows; - long num_blocks = cols; - - kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride); -} - -void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride) -{ - dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); - dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1)); - kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride); -} - -void singa_gpu_set_value(float *data, float value, long n) -{ - kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n); -} - -void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_scale_grad(float *data, float alpha, long n) -{ - kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n); -} - -void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); -} - -void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - -void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - -void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - -void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - - -}//namespace singa_gpu
