SINGA-80 New Blob Level and Address Level Math Operation Interface ----
clean the code format based on cpplint Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/01d91af1 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/01d91af1 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/01d91af1 Branch: refs/heads/master Commit: 01d91af1b14b8bb4e3edbcfa51684aeb759273b0 Parents: 4728f7c Author: jinyangturbo <[email protected]> Authored: Fri Nov 6 04:40:55 2015 -0800 Committer: Wei Wang <[email protected]> Committed: Mon Nov 9 17:04:48 2015 +0800 ---------------------------------------------------------------------- include/singa/blob/math_addr.h | 159 +++++----- include/singa/blob/math_blob.h | 594 +++++++++++++++++------------------- include/singa/blob/singa_op.h | 538 +++++++++++++++++--------------- src/blob/math_addr.cc | 168 +++++----- src/blob/math_blob.cc | 325 ++++++++++---------- 5 files changed, 903 insertions(+), 881 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h index 4895343..7c74201 100644 --- a/include/singa/blob/math_addr.h +++ b/include/singa/blob/math_addr.h @@ -1,122 +1,131 @@ -#ifndef MATH_ADDR_H -#define MATH_ADDR_H - -namespace singa{ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_MATH_ADDR_H_ +#define SINGA_BLOB_MATH_ADDR_H_ + +namespace singa { const float * cpu_uni_vec(const int n); -void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C); +void cpu_gemm(const float * A, const float * B, +const int m, const int n, const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C); -void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C); -// should be very careful : m is the length of B, and n is the length of C , A is a n*m matrix +void cpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C); +// should be very careful: +// m is the length of B, and n is the length of C , A is a n*m matrix void cpu_axpy(const float * A, const int n, const float alpha, float * B); float cpu_dot(const float * A, const float * B, const int n); -/* -//element-wise -template<typename Op> void cpu_e_f(const int n, const float alpha, float * A); -template<typename Op> void cpu_e_f(const int n,const float * A,const float alpha, float * B); -template<typename Op> void cpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C); -// element-wise generalized operation defined in Op -*/ - -//element-wise -template<typename Op> 
void cpu_e_f(const int n, const float alpha, float * A) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, A[i]); +// element-wise +template<typename Op> +void cpu_e_f(const int n, const float alpha, float * A) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, &A[i]); } } -template<typename Op> void cpu_e_f(const int n,const float * A,const float alpha, float * B) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, A[i], B[i]); +template<typename Op> +void cpu_e_f(const int n, const float * A, const float alpha, float * B) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, A[i], &B[i]); } } -template<typename Op> void cpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, beta, A[i], B[i], C[i]); +template<typename Op> +void cpu_e_f(const int n, const float * A, const float * B, +const float alpha, const float beta, float * C) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, beta, A[i], B[i], &C[i]); } } // element-wise generalized operation defined in Op -/* -//matrix/vector expand/reduce -template<typename Op> void cpu_reduce_f(const float * A,const int m, const int n, float * B); -//reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op> void cpu_expand_f(const float * A,const int m, const int n, float * B); -//expand each element in A into a row of B -*/ +// matrix/vector expand/reduce -//matrix/vector expand/reduce - -template<typename Op> void cpu_reduce_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +template<typename Op> +void cpu_reduce_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::Map(A+i*n, n, B[i]); } } -//reduce each row of A to an element of B e.g. 
the sum operation in softmax -template<typename Op> void cpu_expand_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op> +void cpu_expand_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::Map(A[i], n, B+i*n); } } -//expand each element in A into a row of B +// expand each element in A into a row of B + +void gpu_gemm(const float * A, const float * B, +const int m, const int n, const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C); + +void gpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C); -void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C); -void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C); void gpu_axpy(const float * A, const int n, const float alpha, float * B); + float gpu_dot(const float * A, const float * B, const int n); -//element-wise -template<typename Op> void gpu_e_f(const int n, const float alpha, float * A) -{ - Op::CudaMap(alpha, A, n); +// element-wise +template<typename Op> +void gpu_e_f(const int n, const float alpha, float * A) { + Op::CudaMap(alpha, A, n); } -template<typename Op> void gpu_e_f(const int n,const float * A,const float alpha, float * B) -{ - Op::CudaMap(alpha, A, B, n); +template<typename Op> +void gpu_e_f(const int n, const float * A, const float alpha, float * B) { + Op::CudaMap(alpha, A, B, n); } -template<typename Op> void gpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C) -{ - Op::CudaMap(alpha, beta, A, B, C, n); +template<typename Op> +void gpu_e_f(const int n, 
const float * A, const float * B, +const float alpha, const float beta, float * C) { + Op::CudaMap(alpha, beta, A, B, C, n); } // element-wise generalized operation defined in Op -//matrix/vector expand/reduce +// matrix/vector expand/reduce -template<typename Op> void gpu_reduce_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +template<typename Op> +void gpu_reduce_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::CudaMap(A+i*n, n, B[i]); } } -//reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op> void gpu_expand_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op> +void gpu_expand_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::CudaMap(A[i], n, B+i*n); } } -//expand each element in A into a row of B - +// expand each element in A into a row of B } // namespace singa -#endif // MATH_ADDR_H +#endif // SINGA_BLOB_MATH_ADDR_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h index ee0fb60..b52cb91 100644 --- a/include/singa/blob/math_blob.h +++ b/include/singa/blob/math_blob.h @@ -1,380 +1,378 @@ -#ifndef MATH_BLOB_H -#define MATH_BLOB_H +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_MATH_BLOB_H_ +#define SINGA_BLOB_MATH_BLOB_H_ #include <vector> #include "singa/utils/blob.h" -#include "singa/blob/singa_op.h" +#include "singa/blob/singa_op.h" #include "singa/blob/math_addr.h" -namespace singa{ +namespace singa { /*********************Level-2 interface, called by user code*******************/ -// c++ ususally use const & for input arguments, and * for output arguments. -// ww: maybe we represent Blob's shape using int s[4]+dim? currently we use a vector, which may -// not be convenient as int array. 
- int get_size(const std::vector<int>& shape); template <typename Dtype> -bool check_shape_mv(const Blob<Dtype> & A, const Blob<Dtype> & B) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 1) return false; - if(A.shape().at(0) != B.shape().at(0)) return false; - return true; +bool check_shape_mv(const Blob<Dtype> & A, const Blob<Dtype> & B) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 1) return false; + if (A.shape().at(0) != B.shape().at(0)) return false; + return true; } template <typename Dtype> -bool check_shape_equal(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - int asize, bsize, csize; - asize = get_size(A.shape()); - bsize = get_size(B.shape()); - csize = get_size(C.shape()); - if(asize != bsize) return false; - if(asize != csize) return false; - return true; +bool check_shape_equal(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + int asize, bsize, csize; + asize = get_size(A.shape()); + bsize = get_size(B.shape()); + csize = get_size(C.shape()); + if (asize != bsize) return false; + if (asize != csize) return false; + return true; } template <typename Dtype> -bool check_shape_mmm(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 2) return false; - if(C.shape().size() != 2) return false; - int a1, a2, b1, b2, c1, c2; - if(C.isTranspose()) return false; - a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); - a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); - b1 = B.isTranspose() ? B.shape().at(1) : B.shape().at(0); - b2 = B.isTranspose() ? 
B.shape().at(0) : B.shape().at(1); - c1 = C.shape().at(0); - c2 = C.shape().at(1); - if(a2 != b1) return false; - if(a1 != c1) return false; - if(b2 != c2) return false; - return true; +bool check_shape_mmm(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 2) return false; + if (C.shape().size() != 2) return false; + int a1, a2, b1, b2, c1, c2; + if (C.isTranspose()) return false; + a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); + a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + b1 = B.isTranspose() ? B.shape().at(1) : B.shape().at(0); + b2 = B.isTranspose() ? B.shape().at(0) : B.shape().at(1); + c1 = C.shape().at(0); + c2 = C.shape().at(1); + if (a2 != b1) return false; + if (a1 != c1) return false; + if (b2 != c2) return false; + return true; } template <typename Dtype> -bool check_shape_vvm(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 1) return false; - if(B.shape().size() != 1) return false; - if(C.shape().size() != 2) return false; - int a1, b1, c1, c2; - if(C.isTranspose()) return false; - a1 = A.shape().at(0); - b1 = B.shape().at(0); - c1 = C.shape().at(0); - c2 = C.shape().at(1); - if(a1 != c2) return false; - if(b1 != c1) return false; - return true; +bool check_shape_vvm(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 1) return false; + if (B.shape().size() != 1) return false; + if (C.shape().size() != 2) return false; + int a1, b1, c1, c2; + if (C.isTranspose()) return false; + a1 = A.shape().at(0); + b1 = B.shape().at(0); + c1 = C.shape().at(0); + c2 = C.shape().at(1); + if (a1 != c2) return false; + if (b1 != c1) return false; + return true; } template <typename Dtype> -bool check_shape_mvv(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 1) return 
false; - if(C.shape().size() != 1) return false; - int a1, a2, b1, c1; - a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); - a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); - b1 = B.shape().at(0); - c1 = C.shape().at(0); - if(a2 != b1) return false; - if(a1 != c1) return false; - return true; -} - -/**********************************************************************************/ +bool check_shape_mvv(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 1) return false; + if (C.shape().size() != 1) return false; + int a1, a2, b1, c1; + a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); + a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + b1 = B.shape().at(0); + c1 = C.shape().at(0); + if (a2 != b1) return false; + if (a1 != c1) return false; + return true; +} + +/*****************************************************************************/ // blob transformation template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) -{ - Blob<Dtype>* res = new Blob<Dtype>(); - res->Mirror(A); - res->Reshape(shape); - return res; +Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) { + Blob<Dtype>* res = new Blob<Dtype>(); + res->Mirror(A); + res->Reshape(shape); + return res; } -// the current reshape in blob.h is: void Reshape(const std::vector<int>& shape); +// the current reshape in blob.h is: +// void Reshape(const std::vector<int>& shape); template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2) -{ - std::vector<int> tmpshape; - 
tmpshape.push_back(dim1); - tmpshape.push_back(dim2);; - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2);; + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3, int dim4) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - tmpshape.push_back(dim4); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, +int dim3, int dim4) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + tmpshape.push_back(dim4); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3, int dim4, int dim5) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - tmpshape.push_back(dim4); - tmpshape.push_back(dim5); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, +int dim3, int dim4, int dim5) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + tmpshape.push_back(dim4); + tmpshape.push_back(dim5); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Transpose(const Blob<Dtype> & A) -{ - 
Blob<Dtype>* res = new Blob<Dtype>(); - res->Mirror(A); - res->setTranspose(); - return res; +Blob<Dtype>* Transpose(const Blob<Dtype> & A) { + Blob<Dtype>* res = new Blob<Dtype>(); + res->Mirror(A); + res->setTranspose(); + return res; } // return A^T -/**********************************************************************************/ +/*****************************************************************************/ // class1 matrix operation -void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); -// A,B and C are matrix +void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); +// A, B and C are matrix -void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); +void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); // A is matrix,B and C are vector -void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); +void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); // C is matrix,A and B are vector float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B); -//A and B are vectors +// A and B are vectors -void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha = 1, float beta = 1); -//C = alpha*A*B+beta*C, A, B and C are matrix +void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha = 1, float beta = 1); +// C = alpha*A*B+beta*C, A, B and C are matrix -/**********************************************************************************/ +/*****************************************************************************/ // class2 element-wise operation // element-wise generalized operation defined in Op -template<typename Op> -void E_Func(XPU xpu, Blob<float> * A, float alpha) -{ - if(xpu == cpu) - { - int n = get_size(A->shape()); - cpu_e_f<Op>(n, alpha, A->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - int 
n = get_size(A->shape()); - gpu_e_f<Op>(n, alpha, A->mutable_gpu_data()); - } +template<typename Op> +void E_Func(XPU xpu, Blob<float> * A, float alpha) { + if (xpu == cpu) { + int n = get_size(A->shape()); + cpu_e_f<Op>(n, alpha, A->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + int n = get_size(A->shape()); + gpu_e_f<Op>(n, alpha, A->mutable_gpu_data()); + } } template<typename Op> -void E_Func(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - if(check_shape_equal(A, *B, *B)) - { - int n = get_size(A.shape()); - if(xpu == cpu) - { - cpu_e_f<Op>(n, A.cpu_data(), alpha, B->mutable_cpu_data()); - } - - if(xpu == gpu) - { - //gpu part - gpu_e_f<Op>(n, A.gpu_data(), alpha, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } +void E_Func(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) { + if (check_shape_equal(A, *B, *B)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + cpu_e_f<Op>(n, A.cpu_data(), alpha, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(n, A.gpu_data(), alpha, B->mutable_gpu_data()); + } + } else { + // report errors here + } } template<typename Op> -void E_Func(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta) -{ - if(check_shape_equal(A, B, *C)) - { - int n = get_size(A.shape()); - - if(xpu == cpu) - { - cpu_e_f<Op>(n, A.cpu_data(), B.cpu_data(), alpha, beta, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_e_f<Op>(n, A.gpu_data(), B.gpu_data(), alpha, beta, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} - - -inline void Set(XPU xpu, Blob<float> * A,float alpha) -{ - E_Func<singa_op::Set>(xpu, A, alpha); +void E_Func(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha, float beta) { + if (check_shape_equal(A, B, *C)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + cpu_e_f<Op>(n, A.cpu_data(), B.cpu_data(), alpha, beta, + 
C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(n, A.gpu_data(), B.gpu_data(), alpha, beta, + C->mutable_gpu_data()); + } + } else { + // report errors here + } +} + + +inline void Set(XPU xpu, Blob<float> * A, float alpha) { + E_Func<singa::op::Set>(xpu, A, alpha); } // element-wise operation: Ai = alpha -inline void Scale(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - E_Func<singa_op::Scale>(xpu, A, B, alpha); +inline void Scale(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Scale>(xpu, A, B, alpha); } // element-wise operation: Bi = alpha*Ai -inline void Exp(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha = 2.71) -{ - E_Func<singa_op::Exp>(xpu, A, B, alpha); +inline void Exp(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 2.71) { + E_Func<singa::op::Exp>(xpu, A, B, alpha); } // element-wise operation: Bi = alpha^Ai -inline void Exp_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha = 2.71) -{ - E_Func<singa_op::Exp_grad>(xpu, A, B, alpha); +inline void Exp_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 2.71) { + E_Func<singa::op::Exp_grad>(xpu, A, B, alpha); } // element-wise operation: Bi = Ai*log(alpha) -inline void Gsigmoid(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gsigmoid>(xpu, A, B, alpha); +inline void Gsigmoid(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gsigmoid>(xpu, A, B, alpha); } // element-wise operation: b = 1.0f / (1.0f + expf(-a * alpha)); -inline void Gsigmoid_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gsigmoid_grad>(xpu, A, B, alpha); +inline void Gsigmoid_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gsigmoid_grad>(xpu, A, B, alpha); } // element-wise operation: b = alpha * a * ( 1.0f - a ); -inline void Grelu(XPU xpu, const 
Blob<float> & A, Blob<float> * B,float alpha = 0) -{ - E_Func<singa_op::Grelu>(xpu, A, B, alpha); +inline void Grelu(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 0) { + E_Func<singa::op::Grelu>(xpu, A, B, alpha); } // element-wise operation: b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a; -inline void Grelu_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha = 0) -{ - E_Func<singa_op::Grelu_grad>(xpu, A, B, alpha); +inline void Grelu_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 0) { + E_Func<singa::op::Grelu_grad>(xpu, A, B, alpha); } // element-wise operation: b = a > 0.0f ? 1.0f : alpha; -inline void Gtanh(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gtanh>(xpu, A, B, alpha); +inline void Gtanh(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gtanh>(xpu, A, B, alpha); } // element-wise operation: b = tanhf( a * alpha ); -inline void Gtanh_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gtanh_grad>(xpu, A, B, alpha); +inline void Gtanh_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gtanh_grad>(xpu, A, B, alpha); } // element-wise operation: b = alpha * ( 1.0f - a * a ); - -inline void Softplus(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Softplus>(xpu, A, B, 0); + +inline void Softplus(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Softplus>(xpu, A, B, 0); } // element-wise operation: b = logf(1 + expf(a)); -inline void Softplus_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Softplus_grad>(xpu, A, B, 0); +inline void Softplus_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Softplus_grad>(xpu, A, B, 0); } // element-wise operation: b = 1.0f / (1.0f + expf(-a)); -inline void Square(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - 
E_Func<singa_op::Square>(xpu, A, B, 0); +inline void Square(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Square>(xpu, A, B, 0); } // element-wise operation: b = a * a; -inline void Square_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Square_grad>(xpu, A, B, 0); +inline void Square_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Square_grad>(xpu, A, B, 0); } // element-wise operation: b = 2 * sqrt(a); -inline void Sqrt(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Sqrt>(xpu, A, B, 0); +inline void Sqrt(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Sqrt>(xpu, A, B, 0); } // element-wise operation: b = sqrt(a); -inline void Threshold(XPU xpu, const Blob<float> & A, float alpha, Blob<float> * B) -{ - E_Func<singa_op::Threshold>(xpu, A, B, alpha); +inline void Threshold(XPU xpu, const Blob<float> & A, float alpha, +Blob<float> * B) { + E_Func<singa::op::Threshold>(xpu, A, B, alpha); } // element-wise operation: b = a < alpha ? 
1.0f : 0.0f; -inline void Add(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Add>(xpu, A, B, C, 0, 0); +inline void Add(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Add>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai+Bi A,B and C should have the same size -inline void Sub(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Sub>(xpu, A, B, C, 0, 0); +inline void Sub(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Sub>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai-Bi A,B and C should have the same size -inline void Mult(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Mult>(xpu, A, B, C, 0, 0); +inline void Mult(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Mult>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai*Bi A,B and C should have the same size -inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Div>(xpu, A, B, C, 0, 0); +inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Div>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai/Bi A,B and C should have the same size @@ -382,81 +380,59 @@ inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<floa void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha); // element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size -//todo: random part -/* -void Gaussian(XPU xpu, Blob & A, float mu, float sigma); -// element-wise operation: initialize each element in A following distribution Gaussian(mu, sigma) - -void Uniform(XPU xpu, Blob & A, float low, float high); -// element-wise operation: initialize each element in A following uniform 
distribution from low to high - -void Bernoulli(XPU xpu, Blob & A, float p, int n = 1); -// element-wise operation: initialize each element in A following distribution Bernoulli(n,p) -*/ - -/**********************************************************************************/ -//class3 matrix-vector expand/reduce operation - -template<typename Op> -void Reduce_F(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - if(check_shape_mv(A, *B)) - { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; - - if(xpu == cpu) - { - cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} -//reduce each row of A to an element of B e.g. the sum operation in softmax - -template<typename Op> -void Expand_F(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - if(check_shape_mv(*B, A)) - { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; - - if(xpu == cpu) - { - cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} -//expand each element in A into a row of B +/*****************************************************************************/ +// class3 matrix-vector expand/reduce operation + +template<typename Op> +void Reduce_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { + if (check_shape_mv(A, *B)) { + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + if (xpu == cpu) { + cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } + } else { + // report errors here + } +} +// reduce each row of A to an element of B e.g. 
the sum operation in softmax + +template<typename Op> +void Expand_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { + if (check_shape_mv(*B, A)) { + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + if (xpu == cpu) { + cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } + } else { + // report errors here + } +} +// expand each element in A into a row of B void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B); // A is a vector, B is a matrix , let each row of B to be A -void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta); +void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta); // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij // will use gemm. faster than general expand_f -void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta); +void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta); // A is a vector, B is a matrix , Ai = \sigma_j_{alpha*Bij}+beta*Ai // will use gemm. faster than general reduce_f -} // end of namespace singa +} // end of namespace singa -#endif // MATH_BLOB_H +#endif // SINGA_BLOB_MATH_BLOB_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h index 33ef4f8..abdfd66 100644 --- a/include/singa/blob/singa_op.h +++ b/include/singa/blob/singa_op.h @@ -1,252 +1,296 @@ -#ifndef SINGA_OP_H -#define SINGA_OP_H +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_SINGA_OP_H_ +#define SINGA_BLOB_SINGA_OP_H_ -#include<cmath> -#include <algorithm> #include <cuda_runtime.h> -#include "cublas_v2.h" +#include <cmath> +#include <algorithm> +// #include "cublas_v2.h" #include "singa/blob/math_kernel.h" + namespace singa { - enum XPU { cpu, gpu, any}; - -} - -namespace singa_op { - struct Set { - inline static void Map(float alpha, float & a) { - a= alpha; - } - inline static void CudaMap(float alpha, float * a, int n) { - singa::singa_gpu_set_value(a, alpha, n); - } - }; - - struct Scale { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha* a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_scale(a,b,alpha,n); - } - }; - - struct Scale_grad { - inline static void Map(float alpha, float & output) { - output = alpha; - } - inline static void CudaMap(float alpha, float *output, int n) { - singa::singa_gpu_scale_grad(output,alpha,n); - } - }; - - struct Exp { - inline static void Map(float alpha, const float & a, float & b) { - b = pow(a, alpha); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_exp(a,b,alpha,n); - } - }; - - struct Exp_grad { - inline static void Map(float alpha, const float & a, float & b) { - // log is the natrual log based on e 
- b = a * log(alpha); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_exp_grad(a,b,alpha,n); - } - }; - - struct Gsigmoid { - inline static void Map(float alpha, const float & a, float & b) { - b = 1.0f / (1.0f + expf(-a * alpha)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sigmoid(a,b,alpha,n); - } - }; - - struct Gsigmoid_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha * a * ( 1.0f - a ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sigmoid_grad(a,b,alpha,n); - } - }; - - struct Grelu { - inline static void Map(float alpha, const float & a, float & b) { - b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_relu(a,b,alpha,n); - } - }; - - struct Grelu_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = a > 0.0f ? 
1.0f : alpha; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_relu_grad(a,b,alpha,n); - } - }; - - struct Gtanh { - inline static void Map(float alpha, const float & a, float & b) { - b = tanhf( a * alpha ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh(a,b,alpha,n); - } - }; - - struct Gtanh_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha * ( 1.0f - a * a ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh_grad(a,b,alpha,n); - } - }; - - struct Softplus { - inline static void Map(float alpha, const float & a, float & b) { - b = logf(1 + expf(a)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_softplus(a,b,alpha,n); - } - }; - - struct Softplus_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = 1.0f / (1.0f + expf(-a)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_softplus_grad(a,b,alpha,n); - } - }; - - struct Square { - inline static void Map(float alpha, const float & a, float & b) { - b = a * a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_square(a,b,alpha,n); - } - }; - - struct Square_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = 2 * sqrt(a); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_square_grad(a,b,alpha,n); - } - }; - - struct Sqrt { - inline static void Map(float alpha, const float & a, float & b) { - b = sqrt(a); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sqrt(a,b,alpha,n); - } - }; - - struct Threshold { - inline static void Map(float alpha, const float & a, float & b) { - b = a < alpha ? 
1.0f : 0.0f; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_threshold(a,b,alpha,n); - } - }; - - struct Add { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a + b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_add(a,b,c,alpha,beta,n); - } - }; - - struct Sub { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a - b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_sub(a,b,c,alpha,beta,n); - } - }; - - struct Mult { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a * b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_mult(a,b,c,alpha,beta,n); - } - }; - - struct Div { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a / b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_div(a,b,c,alpha,beta,n); - } - }; - - struct Sum { - inline static void Map(const float * a, int n, float & b) { - b = 0; - for(int i = 0 ; i < n ; i++) - { - b += a[i]; - } - } - - inline static void CudaMap(const float * a, int n, float & b) { - float *sum = NULL; - cudaMalloc((void**)&sum, n*sizeof(float)); - - singa::singa_gpu_sum_vec(a,sum,n); - - cudaMemcpyAsync(&b, sum, sizeof(float), cudaMemcpyDeviceToDevice); - cudaFree(sum); - } - }; - - struct Expand_Div { - inline static void Map(const float & a, int n, float * b) { - for(int i = 0 ; i < n ; i++) - { - b[i] /= a; - } - } - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_scale(b,b,a,n); - } - }; - - struct Repmat { 
- inline static void Map(const float & a, int n, float * b) { - for(int i = 0 ; i < n ; i++) - { - b[i] = a; - } - } - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_set_value(b,a,n); - } - }; - -}; // namespace op - -#endif // SINGA_OP_H + enum XPU { cpu, gpu, any}; + +namespace op { +struct Set { + inline static void Map(float alpha, float * a) { + *a = alpha; + } + inline static void CudaMap(float alpha, float * a, int n) { + singa::singa_gpu_set_value(a, alpha, n); + } +}; + +struct Scale { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_scale(a, b, alpha, n); + } +}; + +struct Scale_grad { + inline static void Map(float alpha, float * output) { + *output = alpha; + } + inline static void CudaMap(float alpha, float * output, int n) { + singa::singa_gpu_scale_grad(output, alpha, n); + } +}; + +struct Exp { + inline static void Map(float alpha, const float & a, float * b) { + *b = pow(a, alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_exp(a, b, alpha, n); + } +}; + +struct Exp_grad { + inline static void Map(float alpha, const float & a, float * b) { + // log is the natrual log based on e + *b = a * log(alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_exp_grad(a, b, alpha, n); + } +}; + +struct Gsigmoid { + inline static void Map(float alpha, const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a * alpha)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_sigmoid(a, b, alpha, n); + } +}; + +struct Gsigmoid_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * a * (1.0f - a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + 
singa::singa_gpu_sigmoid_grad(a, b, alpha, n); + } +}; + +struct Grelu { + inline static void Map(float alpha, const float & a, float * b) { + *b = (1 - alpha) * std::max(a, 0.0f) + alpha * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_relu(a, b, alpha, n); + } +}; + +struct Grelu_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = a > 0.0f ? 1.0f : alpha; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_relu_grad(a, b, alpha, n); + } +}; + +struct Gtanh { + inline static void Map(float alpha, const float & a, float * b) { + *b = tanhf(a * alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_tanh(a, b, alpha, n); + } +}; + +struct Gtanh_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * (1.0f - a * a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_tanh_grad(a, b, alpha, n); + } +}; + +struct Softplus { + inline static void Map(float alpha, const float & a, float * b) { + *b = logf(1 + expf(a)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_softplus(a, b, alpha, n); + } +}; + +struct Softplus_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_softplus_grad(a, b, alpha, n); + } +}; + +struct Square { + inline static void Map(float alpha, const float & a, float * b) { + *b = a * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_square(a, b, alpha, n); + } +}; + +struct Square_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = 2 * sqrt(a); + } + inline static void 
CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_square_grad(a, b, alpha, n); + } +}; + +struct Sqrt { + inline static void Map(float alpha, const float & a, float * b) { + *b = sqrt(a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_sqrt(a, b, alpha, n); + } +}; + +struct Threshold { + inline static void Map(float alpha, const float & a, float * b) { + *b = a < alpha ? 1.0f : 0.0f; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_threshold(a, b, alpha, n); + } +}; + +struct Add { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a + b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_add(a, b, c, alpha, beta, n); + } +}; + +struct Sub { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a - b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_sub(a, b, c, alpha, beta, n); + } +}; + +struct Mult { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a * b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_mult(a, b, c, alpha, beta, n); + } +}; + +struct Div { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a / b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_div(a, b, c, alpha, beta, n); + } +}; + +struct Sum { + inline static void Map(const float * a, int n, float * b) { + *b = 0; + for (int i = 0 ; i < n ; i++) { + *b += a[i]; + } + } + + inline static void CudaMap(const 
float * a, int n, float * b) { + float *sum = NULL; + cudaMalloc(<void**>(&sum), n*sizeof(float)); + + singa::singa_gpu_sum_vec(a, sum, n); + + cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice); + cudaFree(sum); + } +}; + +struct Expand_Div { + inline static void Map(const float & a, int n, float * b) { + for (int i = 0 ; i < n ; i++) { + b[i] /= a; + } + } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_scale(b, b, a, n); + } +}; + +struct Repmat { + inline static void Map(const float & a, int n, float * b) { + for (int i = 0 ; i < n ; i++) { + b[i] = a; + } + } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_set_value(b, a, n); + } +}; + +}; // namespace op + +}; // namespace singa + + + +#endif // SINGA_BLOB_SINGA_OP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/src/blob/math_addr.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc index 799a749..8451957 100644 --- a/src/blob/math_addr.cc +++ b/src/blob/math_addr.cc @@ -1,115 +1,117 @@ -extern "C" -{ - #include <cblas.h> -} - -#include <cuda_runtime.h> -#include "cublas_v2.h" +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. 
See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ #include "singa/blob/math_addr.h" +extern "C" { + #include <cblas.h> +} +#include <cuda_runtime.h> #include "singa/blob/singa_op.h" +// #include "cublas_v2.h" -namespace singa{ -const float * cpu_uni_vec(const int n) -{ - float * res = new float[n]; - for(int i = 0; i < n; i++) - res[i] = 1.0; - return res; + +namespace singa { + +const float * cpu_uni_vec(const int n) { + float * res = new float[n]; + for (int i = 0; i < n; i++) + res[i] = 1.0; + return res; } -void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C) -{ - int lda, ldb; - CBLAS_TRANSPOSE tA, tB; - lda = TranA ? m : k; - ldb = TranB ? k : n; - tA = TranA ? CblasTrans : CblasNoTrans; - tB = TranB ? CblasTrans : CblasNoTrans; - cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, B, ldb, beta, C, n); +void cpu_gemm(const float * A, const float * B, const int m, const int n, +const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C) { + int lda, ldb; + CBLAS_TRANSPOSE tA, tB; + lda = TranA ? m : k; + ldb = TranB ? k : n; + tA = TranA ? CblasTrans : CblasNoTrans; + tB = TranB ? CblasTrans : CblasNoTrans; + cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, + B, ldb, beta, C, n); } -void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C) -{ - CBLAS_TRANSPOSE tA; - tA = TranA ? CblasTrans : CblasNoTrans; - cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1); +void cpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C) { + CBLAS_TRANSPOSE tA; + tA = TranA ? 
CblasTrans : CblasNoTrans; + cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1); } -void cpu_axpy(const float * A, const int n, const float alpha, float * B) -{ - cblas_saxpy(n, alpha, A, 1, B, 1); +void cpu_axpy(const float * A, const int n, const float alpha, float * B) { + cblas_saxpy(n, alpha, A, 1, B, 1); } -float cpu_dot(const float * A, const float * B, const int n) -{ - float sum = 0; - for(int i = 0 ; i < n ; i++) - sum += A[i]*B[i]; - return sum; +float cpu_dot(const float * A, const float * B, const int n) { + float sum = 0; + for (int i = 0 ; i < n ; i++) + sum += A[i] * B[i]; + return sum; } -//Trick: swap A and B +// Trick: swap A and B // -void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C) -{ +void gpu_gemm(const float * A, const float * B, const int m, const int n, +const int k, const float alpha, const float beta, const bool TranA, +const bool TranB, float * C) { int lda = TranA ? m : k; int ldb = TranB ? k : n; int ldc = n; - - cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T; - + cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasHandle_t handle; cublasCreate(&handle); - - cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc); - + cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, + A, lda, &beta, C, ldc); cublasDestroy(handle); } -void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C) -{ - int lda = n ; - cublasOperation_t tA= (TranA==true) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 1); - - cublasDestroy(handle); - +void gpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C) { + int lda = n; + cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1); + cublasDestroy(handle); } -void gpu_axpy(const float * A, const int n, const float alpha, float * B) -{ - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasSaxpy(handle,n,&alpha,A,1,B,1); - - cublasDestroy(handle); - +void gpu_axpy(const float * A, const int n, const float alpha, float * B) { + cublasHandle_t handle; + cublasCreate(&handle); + cublasSaxpy(handle, n, &alpha, A, 1, B, 1); + cublasDestroy(handle); } -float gpu_dot(const float * A, const float * B, const int n) -{ - cublasHandle_t handle; - cublasCreate(&handle); - - float result=0.0; - - cublasSdot(handle,n,A,1,B,1,&result); - - cublasDestroy(handle); - - return result; - +float gpu_dot(const float * A, const float * B, const int n) { + cublasHandle_t handle; + cublasCreate(&handle); + float result = 0.0; + cublasSdot(handle, n, A, 1, B, 1, &result); + cublasDestroy(handle); + return result; } -} // namespace singa +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/src/blob/math_blob.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc index bd0e6ee..ad0b766 100644 --- a/src/blob/math_blob.cc +++ b/src/blob/math_blob.cc @@ -1,207 +1,198 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + #include "singa/blob/math_blob.h" #include "singa/blob/math_kernel.h" namespace singa { -/**********************************************************************************/ +/*****************************************************************************/ // shape_check function -int get_size(const std::vector<int>& shape) -{ - int sum = 1; - for(unsigned int i = 0; i < shape.size(); i++) sum *= shape[i]; - return sum; +int get_size(const std::vector<int>& shape) { + int sum = 1; + for (unsigned int i = 0; i < shape.size(); i++) sum *= shape[i]; + return sum; } -/**********************************************************************************/ +/*****************************************************************************/ // class1 matrix operation -void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta) -{ - if(check_shape_mmm(A, B, *C)) - { - int m = C->shape().at(0); - int n = C->shape().at(1); - int k = A.isTranspose() ? 
A.shape().at(0) : A.shape().at(1); - bool TranA = A.isTranspose(); - bool TranB = B.isTranspose(); - - if(xpu == cpu) - { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } +void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha, float beta) { + if (check_shape_mmm(A, B, *C)) { + int m = C->shape().at(0); + int n = C->shape().at(1); + int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + bool TranA = A.isTranspose(); + bool TranB = B.isTranspose(); + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_gpu_data()); + } + } else { + // report errors here + } } -//C = alpha*A*B+beta*C, A, B and C are matrix +// C = alpha*A*B+beta*C, A, B and C are matrix - -void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - GEMM(xpu, A, B, C, 1, 0); +void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + GEMM(xpu, A, B, C, 1, 0); } // A,B and C are matrix -void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - if(check_shape_mvv(A, B, *C)) - { - int m = B.shape().at(0); - int n = C->shape().at(0); - bool TranA = A.isTranspose(); - - if(xpu == cpu) - { - cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } - +void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + if (check_shape_mvv(A, B, *C)) { + int m 
= B.shape().at(0); + int n = C->shape().at(0); + bool TranA = A.isTranspose(); + if (xpu == cpu) { + cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, + C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, + C->mutable_gpu_data()); + } + } else { + // report errors here + } } // A is matrix,B and C are vector - -void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - if(check_shape_vvm(A, B, *C)) - { - int m = C->shape().at(0); - int n = C->shape().at(1); - - if(xpu == cpu) - { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } + +void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + if (check_shape_vvm(A, B, *C)) { + int m = C->shape().at(0); + int n = C->shape().at(1); + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_gpu_data()); + } + } else { + // report errors here + } } // C is matrix,A and B are vector - -float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) -{ - float res = 0; - if(check_shape_equal(A, B, B)) - { - int n = get_size(A.shape()); - if(xpu == cpu) - { - res = cpu_dot(A.cpu_data(), B.cpu_data(), n); - } - if(xpu == gpu) - { - //gpu part - res = gpu_dot(A.gpu_data(), B.gpu_data(), n); - } - } - else{ - // report errors here - } - return res; + +float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) { + float res = 0; + if (check_shape_equal(A, B, B)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + res = cpu_dot(A.cpu_data(), B.cpu_data(), n); + } + if (xpu == gpu) { + // gpu part + res = 
gpu_dot(A.gpu_data(), B.gpu_data(), n); + } + } else { + // report errors here + } + return res; } -//A and B are vectors - -void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - if(check_shape_equal(A, *B, *B)) - { - - if(xpu == cpu) - { - cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } +// A and B are vectors + +void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) { + if (check_shape_equal(A, *B, *B)) { + if (xpu == cpu) { + cpu_axpy(A.cpu_data(), get_size(A.shape()), + alpha, B->mutable_cpu_data()); + } + if (xpu == gpu) { + gpu_axpy(A.gpu_data(), get_size(A.shape()), + alpha, B->mutable_gpu_data()); + } + } else { + // report errors here + } } // element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size -inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - MVAdd(xpu, A, B, 1, 0); +inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) { + MVAdd(xpu, A, B, 1, 0); } // A is a vector, B is a matrix , let each row of B to be A -void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) -{ - if(check_shape_mv(*B, A)) - { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; - - if(xpu == cpu) - { - const float * univ = cpu_uni_vec(n); - cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, false, false, B->mutable_cpu_data()); - delete univ; - } - - if(xpu == gpu) - { - singa_gpu_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n); - //gpu part - } - } - else{ - // report errors here - } +void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta) { + if (check_shape_mv(*B, A)) { + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + if (xpu == cpu) { + const float * univ = cpu_uni_vec(n); + cpu_gemm(A.cpu_data(), univ, m, 
n, 1, alpha, beta, + false, false, B->mutable_cpu_data()); + delete univ; + } + if (xpu == gpu) { + singa_gpu_add_vec_row(B->gpu_data(), + A.gpu_data(), A.gpu_data(), m, n, n); + // gpu part + } + } else { + // report errors here + } } // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij // will use gemm. faster than general expand_f -void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) -{ - if(check_shape_mv(A, *B)) - { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; - - if(xpu == cpu) - { - const float * univ = cpu_uni_vec(n); - cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, false, false, B->mutable_cpu_data()); - delete univ; - } - if(xpu == gpu) - { - singa_gpu_sum_col(A.gpu_data(),B->gpu_data(),m,n,n); - //gpu part - } - } - else{ - // report errors here - } +void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta) { + if (check_shape_mv(A, *B)) { + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + if (xpu == cpu) { + const float * univ = cpu_uni_vec(n); + cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, + false, false, B->mutable_cpu_data()); + delete univ; + } + if (xpu == gpu) { + singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); + // gpu part + } + } else { + // report errors here + } } // B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi // will use gemm. faster than general reduce_f -} // namespace singa +} // namespace singa
