SINGA-80 New Blob Level and Address Level Math Operation Interface ----
clean the code format based on cpplint Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/01d91af1 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/01d91af1 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/01d91af1 Branch: refs/heads/master Commit: 01d91af1b14b8bb4e3edbcfa51684aeb759273b0 Parents: 4728f7c Author: jinyangturbo <[email protected]> Authored: Fri Nov 6 04:40:55 2015 -0800 Committer: Wei Wang <[email protected]> Committed: Mon Nov 9 17:04:48 2015 +0800 ---------------------------------------------------------------------- include/singa/blob/math_addr.h | 159 +++++----- include/singa/blob/math_blob.h | 594 +++++++++++++++++------------------- include/singa/blob/singa_op.h | 538 +++++++++++++++++--------------- src/blob/math_addr.cc | 168 +++++----- src/blob/math_blob.cc | 325 ++++++++++---------- 5 files changed, 903 insertions(+), 881 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/math_addr.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h index 4895343..7c74201 100644 --- a/include/singa/blob/math_addr.h +++ b/include/singa/blob/math_addr.h @@ -1,122 +1,131 @@ -#ifndef MATH_ADDR_H -#define MATH_ADDR_H - -namespace singa{ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_MATH_ADDR_H_ +#define SINGA_BLOB_MATH_ADDR_H_ + +namespace singa { const float * cpu_uni_vec(const int n); -void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C); +void cpu_gemm(const float * A, const float * B, +const int m, const int n, const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C); -void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C); -// should be very careful : m is the length of B, and n is the length of C , A is a n*m matrix +void cpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C); +// should be very careful: +// m is the length of B, and n is the length of C , A is a n*m matrix void cpu_axpy(const float * A, const int n, const float alpha, float * B); float cpu_dot(const float * A, const float * B, const int n); -/* -//element-wise -template<typename Op> void cpu_e_f(const int n, const float alpha, float * A); -template<typename Op> void cpu_e_f(const int n,const float * A,const float alpha, float * B); -template<typename Op> void cpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C); -// element-wise generalized operation defined in Op -*/ - -//element-wise -template<typename Op> 
void cpu_e_f(const int n, const float alpha, float * A) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, A[i]); +// element-wise +template<typename Op> +void cpu_e_f(const int n, const float alpha, float * A) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, &A[i]); } } -template<typename Op> void cpu_e_f(const int n,const float * A,const float alpha, float * B) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, A[i], B[i]); +template<typename Op> +void cpu_e_f(const int n, const float * A, const float alpha, float * B) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, A[i], &B[i]); } } -template<typename Op> void cpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C) -{ - for(int i = 0 ; i < n ; i++) - { - Op::Map(alpha, beta, A[i], B[i], C[i]); +template<typename Op> +void cpu_e_f(const int n, const float * A, const float * B, +const float alpha, const float beta, float * C) { + for (int i = 0 ; i < n ; i++) { + Op::Map(alpha, beta, A[i], B[i], &C[i]); } } // element-wise generalized operation defined in Op -/* -//matrix/vector expand/reduce -template<typename Op> void cpu_reduce_f(const float * A,const int m, const int n, float * B); -//reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op> void cpu_expand_f(const float * A,const int m, const int n, float * B); -//expand each element in A into a row of B -*/ +// matrix/vector expand/reduce -//matrix/vector expand/reduce - -template<typename Op> void cpu_reduce_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +template<typename Op> +void cpu_reduce_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::Map(A+i*n, n, B[i]); } } -//reduce each row of A to an element of B e.g. 
the sum operation in softmax -template<typename Op> void cpu_expand_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op> +void cpu_expand_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::Map(A[i], n, B+i*n); } } -//expand each element in A into a row of B +// expand each element in A into a row of B + +void gpu_gemm(const float * A, const float * B, +const int m, const int n, const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C); + +void gpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C); -void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C); -void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C); void gpu_axpy(const float * A, const int n, const float alpha, float * B); + float gpu_dot(const float * A, const float * B, const int n); -//element-wise -template<typename Op> void gpu_e_f(const int n, const float alpha, float * A) -{ - Op::CudaMap(alpha, A, n); +// element-wise +template<typename Op> +void gpu_e_f(const int n, const float alpha, float * A) { + Op::CudaMap(alpha, A, n); } -template<typename Op> void gpu_e_f(const int n,const float * A,const float alpha, float * B) -{ - Op::CudaMap(alpha, A, B, n); +template<typename Op> +void gpu_e_f(const int n, const float * A, const float alpha, float * B) { + Op::CudaMap(alpha, A, B, n); } -template<typename Op> void gpu_e_f(const int n,const float * A,const float * B,const float alpha, const float beta,float * C) -{ - Op::CudaMap(alpha, beta, A, B, C, n); +template<typename Op> +void gpu_e_f(const int n, 
const float * A, const float * B, +const float alpha, const float beta, float * C) { + Op::CudaMap(alpha, beta, A, B, C, n); } // element-wise generalized operation defined in Op -//matrix/vector expand/reduce +// matrix/vector expand/reduce -template<typename Op> void gpu_reduce_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +template<typename Op> +void gpu_reduce_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::CudaMap(A+i*n, n, B[i]); } } -//reduce each row of A to an element of B e.g. the sum operation in softmax -template<typename Op> void gpu_expand_f(const float * A,const int m, const int n, float * B) -{ - for(int i = 0 ; i < m ; i++) - { +// reduce each row of A to an element of B e.g. the sum operation in softmax +template<typename Op> +void gpu_expand_f(const float * A, const int m, const int n, float * B) { + for (int i = 0 ; i < m ; i++) { Op::CudaMap(A[i], n, B+i*n); } } -//expand each element in A into a row of B - +// expand each element in A into a row of B } // namespace singa -#endif // MATH_ADDR_H +#endif // SINGA_BLOB_MATH_ADDR_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h index ee0fb60..b52cb91 100644 --- a/include/singa/blob/math_blob.h +++ b/include/singa/blob/math_blob.h @@ -1,380 +1,378 @@ -#ifndef MATH_BLOB_H -#define MATH_BLOB_H +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_MATH_BLOB_H_ +#define SINGA_BLOB_MATH_BLOB_H_ #include <vector> #include "singa/utils/blob.h" -#include "singa/blob/singa_op.h" +#include "singa/blob/singa_op.h" #include "singa/blob/math_addr.h" -namespace singa{ +namespace singa { /*********************Level-2 interface, called by user code*******************/ -// c++ ususally use const & for input arguments, and * for output arguments. -// ww: maybe we represent Blob's shape using int s[4]+dim? currently we use a vector, which may -// not be convenient as int array. 
- int get_size(const std::vector<int>& shape); template <typename Dtype> -bool check_shape_mv(const Blob<Dtype> & A, const Blob<Dtype> & B) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 1) return false; - if(A.shape().at(0) != B.shape().at(0)) return false; - return true; +bool check_shape_mv(const Blob<Dtype> & A, const Blob<Dtype> & B) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 1) return false; + if (A.shape().at(0) != B.shape().at(0)) return false; + return true; } template <typename Dtype> -bool check_shape_equal(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - int asize, bsize, csize; - asize = get_size(A.shape()); - bsize = get_size(B.shape()); - csize = get_size(C.shape()); - if(asize != bsize) return false; - if(asize != csize) return false; - return true; +bool check_shape_equal(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + int asize, bsize, csize; + asize = get_size(A.shape()); + bsize = get_size(B.shape()); + csize = get_size(C.shape()); + if (asize != bsize) return false; + if (asize != csize) return false; + return true; } template <typename Dtype> -bool check_shape_mmm(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 2) return false; - if(C.shape().size() != 2) return false; - int a1, a2, b1, b2, c1, c2; - if(C.isTranspose()) return false; - a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); - a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); - b1 = B.isTranspose() ? B.shape().at(1) : B.shape().at(0); - b2 = B.isTranspose() ? 
B.shape().at(0) : B.shape().at(1); - c1 = C.shape().at(0); - c2 = C.shape().at(1); - if(a2 != b1) return false; - if(a1 != c1) return false; - if(b2 != c2) return false; - return true; +bool check_shape_mmm(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 2) return false; + if (C.shape().size() != 2) return false; + int a1, a2, b1, b2, c1, c2; + if (C.isTranspose()) return false; + a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); + a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + b1 = B.isTranspose() ? B.shape().at(1) : B.shape().at(0); + b2 = B.isTranspose() ? B.shape().at(0) : B.shape().at(1); + c1 = C.shape().at(0); + c2 = C.shape().at(1); + if (a2 != b1) return false; + if (a1 != c1) return false; + if (b2 != c2) return false; + return true; } template <typename Dtype> -bool check_shape_vvm(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 1) return false; - if(B.shape().size() != 1) return false; - if(C.shape().size() != 2) return false; - int a1, b1, c1, c2; - if(C.isTranspose()) return false; - a1 = A.shape().at(0); - b1 = B.shape().at(0); - c1 = C.shape().at(0); - c2 = C.shape().at(1); - if(a1 != c2) return false; - if(b1 != c1) return false; - return true; +bool check_shape_vvm(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 1) return false; + if (B.shape().size() != 1) return false; + if (C.shape().size() != 2) return false; + int a1, b1, c1, c2; + if (C.isTranspose()) return false; + a1 = A.shape().at(0); + b1 = B.shape().at(0); + c1 = C.shape().at(0); + c2 = C.shape().at(1); + if (a1 != c2) return false; + if (b1 != c1) return false; + return true; } template <typename Dtype> -bool check_shape_mvv(const Blob<Dtype> & A, const Blob<Dtype> & B, const Blob<Dtype> & C) -{ - if(A.shape().size() != 2) return false; - if(B.shape().size() != 1) return 
false; - if(C.shape().size() != 1) return false; - int a1, a2, b1, c1; - a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); - a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); - b1 = B.shape().at(0); - c1 = C.shape().at(0); - if(a2 != b1) return false; - if(a1 != c1) return false; - return true; -} - -/**********************************************************************************/ +bool check_shape_mvv(const Blob<Dtype> & A, const Blob<Dtype> & B, +const Blob<Dtype> & C) { + if (A.shape().size() != 2) return false; + if (B.shape().size() != 1) return false; + if (C.shape().size() != 1) return false; + int a1, a2, b1, c1; + a1 = A.isTranspose() ? A.shape().at(1) : A.shape().at(0); + a2 = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + b1 = B.shape().at(0); + c1 = C.shape().at(0); + if (a2 != b1) return false; + if (a1 != c1) return false; + return true; +} + +/*****************************************************************************/ // blob transformation template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) -{ - Blob<Dtype>* res = new Blob<Dtype>(); - res->Mirror(A); - res->Reshape(shape); - return res; +Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) { + Blob<Dtype>* res = new Blob<Dtype>(); + res->Mirror(A); + res->Reshape(shape); + return res; } -// the current reshape in blob.h is: void Reshape(const std::vector<int>& shape); +// the current reshape in blob.h is: +// void Reshape(const std::vector<int>& shape); template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2) -{ - std::vector<int> tmpshape; - 
tmpshape.push_back(dim1); - tmpshape.push_back(dim2);; - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2);; + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3, int dim4) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - tmpshape.push_back(dim4); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, +int dim3, int dim4) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + tmpshape.push_back(dim4); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, int dim3, int dim4, int dim5) -{ - std::vector<int> tmpshape; - tmpshape.push_back(dim1); - tmpshape.push_back(dim2); - tmpshape.push_back(dim3); - tmpshape.push_back(dim4); - tmpshape.push_back(dim5); - return Reshape(A, tmpshape); +Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim1, int dim2, +int dim3, int dim4, int dim5) { + std::vector<int> tmpshape; + tmpshape.push_back(dim1); + tmpshape.push_back(dim2); + tmpshape.push_back(dim3); + tmpshape.push_back(dim4); + tmpshape.push_back(dim5); + return Reshape(A, tmpshape); } template <typename Dtype> -Blob<Dtype>* Transpose(const Blob<Dtype> & A) -{ - 
Blob<Dtype>* res = new Blob<Dtype>(); - res->Mirror(A); - res->setTranspose(); - return res; +Blob<Dtype>* Transpose(const Blob<Dtype> & A) { + Blob<Dtype>* res = new Blob<Dtype>(); + res->Mirror(A); + res->setTranspose(); + return res; } // return A^T -/**********************************************************************************/ +/*****************************************************************************/ // class1 matrix operation -void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); -// A,B and C are matrix +void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); +// A, B and C are matrix -void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); +void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); // A is matrix,B and C are vector -void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C); +void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C); // C is matrix,A and B are vector float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B); -//A and B are vectors +// A and B are vectors -void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha = 1, float beta = 1); -//C = alpha*A*B+beta*C, A, B and C are matrix +void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha = 1, float beta = 1); +// C = alpha*A*B+beta*C, A, B and C are matrix -/**********************************************************************************/ +/*****************************************************************************/ // class2 element-wise operation // element-wise generalized operation defined in Op -template<typename Op> -void E_Func(XPU xpu, Blob<float> * A, float alpha) -{ - if(xpu == cpu) - { - int n = get_size(A->shape()); - cpu_e_f<Op>(n, alpha, A->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - int 
n = get_size(A->shape()); - gpu_e_f<Op>(n, alpha, A->mutable_gpu_data()); - } +template<typename Op> +void E_Func(XPU xpu, Blob<float> * A, float alpha) { + if (xpu == cpu) { + int n = get_size(A->shape()); + cpu_e_f<Op>(n, alpha, A->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + int n = get_size(A->shape()); + gpu_e_f<Op>(n, alpha, A->mutable_gpu_data()); + } } template<typename Op> -void E_Func(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - if(check_shape_equal(A, *B, *B)) - { - int n = get_size(A.shape()); - if(xpu == cpu) - { - cpu_e_f<Op>(n, A.cpu_data(), alpha, B->mutable_cpu_data()); - } - - if(xpu == gpu) - { - //gpu part - gpu_e_f<Op>(n, A.gpu_data(), alpha, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } +void E_Func(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) { + if (check_shape_equal(A, *B, *B)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + cpu_e_f<Op>(n, A.cpu_data(), alpha, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(n, A.gpu_data(), alpha, B->mutable_gpu_data()); + } + } else { + // report errors here + } } template<typename Op> -void E_Func(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta) -{ - if(check_shape_equal(A, B, *C)) - { - int n = get_size(A.shape()); - - if(xpu == cpu) - { - cpu_e_f<Op>(n, A.cpu_data(), B.cpu_data(), alpha, beta, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_e_f<Op>(n, A.gpu_data(), B.gpu_data(), alpha, beta, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} - - -inline void Set(XPU xpu, Blob<float> * A,float alpha) -{ - E_Func<singa_op::Set>(xpu, A, alpha); +void E_Func(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha, float beta) { + if (check_shape_equal(A, B, *C)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + cpu_e_f<Op>(n, A.cpu_data(), B.cpu_data(), alpha, beta, + 
C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_e_f<Op>(n, A.gpu_data(), B.gpu_data(), alpha, beta, + C->mutable_gpu_data()); + } + } else { + // report errors here + } +} + + +inline void Set(XPU xpu, Blob<float> * A, float alpha) { + E_Func<singa::op::Set>(xpu, A, alpha); } // element-wise operation: Ai = alpha -inline void Scale(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - E_Func<singa_op::Scale>(xpu, A, B, alpha); +inline void Scale(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Scale>(xpu, A, B, alpha); } // element-wise operation: Bi = alpha*Ai -inline void Exp(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha = 2.71) -{ - E_Func<singa_op::Exp>(xpu, A, B, alpha); +inline void Exp(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 2.71) { + E_Func<singa::op::Exp>(xpu, A, B, alpha); } // element-wise operation: Bi = alpha^Ai -inline void Exp_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha = 2.71) -{ - E_Func<singa_op::Exp_grad>(xpu, A, B, alpha); +inline void Exp_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 2.71) { + E_Func<singa::op::Exp_grad>(xpu, A, B, alpha); } // element-wise operation: Bi = Ai*log(alpha) -inline void Gsigmoid(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gsigmoid>(xpu, A, B, alpha); +inline void Gsigmoid(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gsigmoid>(xpu, A, B, alpha); } // element-wise operation: b = 1.0f / (1.0f + expf(-a * alpha)); -inline void Gsigmoid_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gsigmoid_grad>(xpu, A, B, alpha); +inline void Gsigmoid_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gsigmoid_grad>(xpu, A, B, alpha); } // element-wise operation: b = alpha * a * ( 1.0f - a ); -inline void Grelu(XPU xpu, const 
Blob<float> & A, Blob<float> * B,float alpha = 0) -{ - E_Func<singa_op::Grelu>(xpu, A, B, alpha); +inline void Grelu(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 0) { + E_Func<singa::op::Grelu>(xpu, A, B, alpha); } // element-wise operation: b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a; -inline void Grelu_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha = 0) -{ - E_Func<singa_op::Grelu_grad>(xpu, A, B, alpha); +inline void Grelu_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha = 0) { + E_Func<singa::op::Grelu_grad>(xpu, A, B, alpha); } // element-wise operation: b = a > 0.0f ? 1.0f : alpha; -inline void Gtanh(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gtanh>(xpu, A, B, alpha); +inline void Gtanh(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gtanh>(xpu, A, B, alpha); } // element-wise operation: b = tanhf( a * alpha ); -inline void Gtanh_grad(XPU xpu, const Blob<float> & A, Blob<float> * B,float alpha) -{ - E_Func<singa_op::Gtanh_grad>(xpu, A, B, alpha); +inline void Gtanh_grad(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha) { + E_Func<singa::op::Gtanh_grad>(xpu, A, B, alpha); } // element-wise operation: b = alpha * ( 1.0f - a * a ); - -inline void Softplus(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Softplus>(xpu, A, B, 0); + +inline void Softplus(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Softplus>(xpu, A, B, 0); } // element-wise operation: b = logf(1 + expf(a)); -inline void Softplus_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Softplus_grad>(xpu, A, B, 0); +inline void Softplus_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Softplus_grad>(xpu, A, B, 0); } // element-wise operation: b = 1.0f / (1.0f + expf(-a)); -inline void Square(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - 
E_Func<singa_op::Square>(xpu, A, B, 0); +inline void Square(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Square>(xpu, A, B, 0); } // element-wise operation: b = a * a; -inline void Square_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Square_grad>(xpu, A, B, 0); +inline void Square_grad(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Square_grad>(xpu, A, B, 0); } // element-wise operation: b = 2 * sqrt(a); -inline void Sqrt(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - E_Func<singa_op::Sqrt>(xpu, A, B, 0); +inline void Sqrt(XPU xpu, const Blob<float> & A, Blob<float> * B) { + E_Func<singa::op::Sqrt>(xpu, A, B, 0); } // element-wise operation: b = sqrt(a); -inline void Threshold(XPU xpu, const Blob<float> & A, float alpha, Blob<float> * B) -{ - E_Func<singa_op::Threshold>(xpu, A, B, alpha); +inline void Threshold(XPU xpu, const Blob<float> & A, float alpha, +Blob<float> * B) { + E_Func<singa::op::Threshold>(xpu, A, B, alpha); } // element-wise operation: b = a < alpha ? 
1.0f : 0.0f; -inline void Add(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Add>(xpu, A, B, C, 0, 0); +inline void Add(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Add>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai+Bi A,B and C should have the same size -inline void Sub(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Sub>(xpu, A, B, C, 0, 0); +inline void Sub(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Sub>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai-Bi A,B and C should have the same size -inline void Mult(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Mult>(xpu, A, B, C, 0, 0); +inline void Mult(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Mult>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai*Bi A,B and C should have the same size -inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - E_Func<singa_op::Div>(xpu, A, B, C, 0, 0); +inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + E_Func<singa::op::Div>(xpu, A, B, C, 0, 0); } // element-wise operation: Ci = Ai/Bi A,B and C should have the same size @@ -382,81 +380,59 @@ inline void Div(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<floa void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha); // element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size -//todo: random part -/* -void Gaussian(XPU xpu, Blob & A, float mu, float sigma); -// element-wise operation: initialize each element in A following distribution Gaussian(mu, sigma) - -void Uniform(XPU xpu, Blob & A, float low, float high); -// element-wise operation: initialize each element in A following uniform 
distribution from low to high - -void Bernoulli(XPU xpu, Blob & A, float p, int n = 1); -// element-wise operation: initialize each element in A following distribution Bernoulli(n,p) -*/ - -/**********************************************************************************/ -//class3 matrix-vector expand/reduce operation - -template<typename Op> -void Reduce_F(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - if(check_shape_mv(A, *B)) - { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; - - if(xpu == cpu) - { - cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} -//reduce each row of A to an element of B e.g. the sum operation in softmax - -template<typename Op> -void Expand_F(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - if(check_shape_mv(*B, A)) - { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; - - if(xpu == cpu) - { - cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } -} -//expand each element in A into a row of B +/*****************************************************************************/ +// class3 matrix-vector expand/reduce operation + +template<typename Op> +void Reduce_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { + if (check_shape_mv(A, *B)) { + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + if (xpu == cpu) { + cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } + } else { + // report errors here + } +} +// reduce each row of A to an element of B e.g. 
the sum operation in softmax + +template<typename Op> +void Expand_F(XPU xpu, const Blob<float> & A, Blob<float> * B) { + if (check_shape_mv(*B, A)) { + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + if (xpu == cpu) { + cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data()); + } + } else { + // report errors here + } +} +// expand each element in A into a row of B void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B); // A is a vector, B is a matrix , let each row of B to be A -void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta); +void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta); // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij // will use gemm. faster than general expand_f -void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta); +void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta); // A is a vector, B is a matrix , Ai = \sigma_j_{alpha*Bij}+beta*Ai // will use gemm. faster than general reduce_f -} // end of namespace singa +} // end of namespace singa -#endif // MATH_BLOB_H +#endif // SINGA_BLOB_MATH_BLOB_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/include/singa/blob/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h index 33ef4f8..abdfd66 100644 --- a/include/singa/blob/singa_op.h +++ b/include/singa/blob/singa_op.h @@ -1,252 +1,296 @@ -#ifndef SINGA_OP_H -#define SINGA_OP_H +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_BLOB_SINGA_OP_H_ +#define SINGA_BLOB_SINGA_OP_H_ -#include<cmath> -#include <algorithm> #include <cuda_runtime.h> -#include "cublas_v2.h" +#include <cmath> +#include <algorithm> +// #include "cublas_v2.h" #include "singa/blob/math_kernel.h" + namespace singa { - enum XPU { cpu, gpu, any}; - -} - -namespace singa_op { - struct Set { - inline static void Map(float alpha, float & a) { - a= alpha; - } - inline static void CudaMap(float alpha, float * a, int n) { - singa::singa_gpu_set_value(a, alpha, n); - } - }; - - struct Scale { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha* a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_scale(a,b,alpha,n); - } - }; - - struct Scale_grad { - inline static void Map(float alpha, float & output) { - output = alpha; - } - inline static void CudaMap(float alpha, float *output, int n) { - singa::singa_gpu_scale_grad(output,alpha,n); - } - }; - - struct Exp { - inline static void Map(float alpha, const float & a, float & b) { - b = pow(a, alpha); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_exp(a,b,alpha,n); - } - }; - - struct Exp_grad { - inline static void Map(float alpha, const float & a, float & b) { - // log is the natrual log based on e 
- b = a * log(alpha); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_exp_grad(a,b,alpha,n); - } - }; - - struct Gsigmoid { - inline static void Map(float alpha, const float & a, float & b) { - b = 1.0f / (1.0f + expf(-a * alpha)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sigmoid(a,b,alpha,n); - } - }; - - struct Gsigmoid_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha * a * ( 1.0f - a ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sigmoid_grad(a,b,alpha,n); - } - }; - - struct Grelu { - inline static void Map(float alpha, const float & a, float & b) { - b = ( 1 - alpha ) * std::max( a, 0.0f ) + alpha * a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_relu(a,b,alpha,n); - } - }; - - struct Grelu_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = a > 0.0f ? 
1.0f : alpha; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_relu_grad(a,b,alpha,n); - } - }; - - struct Gtanh { - inline static void Map(float alpha, const float & a, float & b) { - b = tanhf( a * alpha ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh(a,b,alpha,n); - } - }; - - struct Gtanh_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = alpha * ( 1.0f - a * a ); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_tanh_grad(a,b,alpha,n); - } - }; - - struct Softplus { - inline static void Map(float alpha, const float & a, float & b) { - b = logf(1 + expf(a)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_softplus(a,b,alpha,n); - } - }; - - struct Softplus_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = 1.0f / (1.0f + expf(-a)); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_softplus_grad(a,b,alpha,n); - } - }; - - struct Square { - inline static void Map(float alpha, const float & a, float & b) { - b = a * a; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_square(a,b,alpha,n); - } - }; - - struct Square_grad { - inline static void Map(float alpha, const float & a, float & b) { - b = 2 * sqrt(a); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_square_grad(a,b,alpha,n); - } - }; - - struct Sqrt { - inline static void Map(float alpha, const float & a, float & b) { - b = sqrt(a); - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_sqrt(a,b,alpha,n); - } - }; - - struct Threshold { - inline static void Map(float alpha, const float & a, float & b) { - b = a < alpha ? 
1.0f : 0.0f; - } - inline static void CudaMap(float alpha, const float * a, float * b, int n) { - singa::singa_gpu_threshold(a,b,alpha,n); - } - }; - - struct Add { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a + b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_add(a,b,c,alpha,beta,n); - } - }; - - struct Sub { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a - b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_sub(a,b,c,alpha,beta,n); - } - }; - - struct Mult { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a * b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_mult(a,b,c,alpha,beta,n); - } - }; - - struct Div { - inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) { - c = a / b; - } - inline static void CudaMap(float alpha, float beta, const float * a, const float * b, float *c, int n) { - singa::singa_gpu_div(a,b,c,alpha,beta,n); - } - }; - - struct Sum { - inline static void Map(const float * a, int n, float & b) { - b = 0; - for(int i = 0 ; i < n ; i++) - { - b += a[i]; - } - } - - inline static void CudaMap(const float * a, int n, float & b) { - float *sum = NULL; - cudaMalloc((void**)&sum, n*sizeof(float)); - - singa::singa_gpu_sum_vec(a,sum,n); - - cudaMemcpyAsync(&b, sum, sizeof(float), cudaMemcpyDeviceToDevice); - cudaFree(sum); - } - }; - - struct Expand_Div { - inline static void Map(const float & a, int n, float * b) { - for(int i = 0 ; i < n ; i++) - { - b[i] /= a; - } - } - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_scale(b,b,a,n); - } - }; - - struct Repmat { 
- inline static void Map(const float & a, int n, float * b) { - for(int i = 0 ; i < n ; i++) - { - b[i] = a; - } - } - inline static void CudaMap(const float & a, int n, float * b) { - singa::singa_gpu_set_value(b,a,n); - } - }; - -}; // namespace op - -#endif // SINGA_OP_H + enum XPU { cpu, gpu, any}; + +namespace op { +struct Set { + inline static void Map(float alpha, float * a) { + *a = alpha; + } + inline static void CudaMap(float alpha, float * a, int n) { + singa::singa_gpu_set_value(a, alpha, n); + } +}; + +struct Scale { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_scale(a, b, alpha, n); + } +}; + +struct Scale_grad { + inline static void Map(float alpha, float * output) { + *output = alpha; + } + inline static void CudaMap(float alpha, float * output, int n) { + singa::singa_gpu_scale_grad(output, alpha, n); + } +}; + +struct Exp { + inline static void Map(float alpha, const float & a, float * b) { + *b = pow(a, alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_exp(a, b, alpha, n); + } +}; + +struct Exp_grad { + inline static void Map(float alpha, const float & a, float * b) { + // log is the natrual log based on e + *b = a * log(alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_exp_grad(a, b, alpha, n); + } +}; + +struct Gsigmoid { + inline static void Map(float alpha, const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a * alpha)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_sigmoid(a, b, alpha, n); + } +}; + +struct Gsigmoid_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * a * (1.0f - a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + 
singa::singa_gpu_sigmoid_grad(a, b, alpha, n); + } +}; + +struct Grelu { + inline static void Map(float alpha, const float & a, float * b) { + *b = (1 - alpha) * std::max(a, 0.0f) + alpha * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_relu(a, b, alpha, n); + } +}; + +struct Grelu_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = a > 0.0f ? 1.0f : alpha; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_relu_grad(a, b, alpha, n); + } +}; + +struct Gtanh { + inline static void Map(float alpha, const float & a, float * b) { + *b = tanhf(a * alpha); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_tanh(a, b, alpha, n); + } +}; + +struct Gtanh_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = alpha * (1.0f - a * a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_tanh_grad(a, b, alpha, n); + } +}; + +struct Softplus { + inline static void Map(float alpha, const float & a, float * b) { + *b = logf(1 + expf(a)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_softplus(a, b, alpha, n); + } +}; + +struct Softplus_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = 1.0f / (1.0f + expf(-a)); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_softplus_grad(a, b, alpha, n); + } +}; + +struct Square { + inline static void Map(float alpha, const float & a, float * b) { + *b = a * a; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_square(a, b, alpha, n); + } +}; + +struct Square_grad { + inline static void Map(float alpha, const float & a, float * b) { + *b = 2 * sqrt(a); + } + inline static void 
CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_square_grad(a, b, alpha, n); + } +}; + +struct Sqrt { + inline static void Map(float alpha, const float & a, float * b) { + *b = sqrt(a); + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_sqrt(a, b, alpha, n); + } +}; + +struct Threshold { + inline static void Map(float alpha, const float & a, float * b) { + *b = a < alpha ? 1.0f : 0.0f; + } + inline static void CudaMap(float alpha, const float * a, + float * b, int n) { + singa::singa_gpu_threshold(a, b, alpha, n); + } +}; + +struct Add { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a + b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_add(a, b, c, alpha, beta, n); + } +}; + +struct Sub { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a - b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_sub(a, b, c, alpha, beta, n); + } +}; + +struct Mult { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a * b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_mult(a, b, c, alpha, beta, n); + } +}; + +struct Div { + inline static void Map(float alpha, float beta, const float & a, + const float & b, float * c) { + *c = a / b; + } + inline static void CudaMap(float alpha, float beta, const float * a, + const float * b, float * c, int n) { + singa::singa_gpu_div(a, b, c, alpha, beta, n); + } +}; + +struct Sum { + inline static void Map(const float * a, int n, float * b) { + *b = 0; + for (int i = 0 ; i < n ; i++) { + *b += a[i]; + } + } + + inline static void CudaMap(const 
float * a, int n, float * b) { + float *sum = NULL; + cudaMalloc(<void**>(&sum), n*sizeof(float)); + + singa::singa_gpu_sum_vec(a, sum, n); + + cudaMemcpyAsync(b, sum, sizeof(float), cudaMemcpyDeviceToDevice); + cudaFree(sum); + } +}; + +struct Expand_Div { + inline static void Map(const float & a, int n, float * b) { + for (int i = 0 ; i < n ; i++) { + b[i] /= a; + } + } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_scale(b, b, a, n); + } +}; + +struct Repmat { + inline static void Map(const float & a, int n, float * b) { + for (int i = 0 ; i < n ; i++) { + b[i] = a; + } + } + inline static void CudaMap(const float & a, int n, float * b) { + singa::singa_gpu_set_value(b, a, n); + } +}; + +}; // namespace op + +}; // namespace singa + + + +#endif // SINGA_BLOB_SINGA_OP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/src/blob/math_addr.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc index 799a749..8451957 100644 --- a/src/blob/math_addr.cc +++ b/src/blob/math_addr.cc @@ -1,115 +1,117 @@ -extern "C" -{ - #include <cblas.h> -} - -#include <cuda_runtime.h> -#include "cublas_v2.h" +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. 
See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ #include "singa/blob/math_addr.h" +extern "C" { + #include <cblas.h> +} +#include <cuda_runtime.h> #include "singa/blob/singa_op.h" +// #include "cublas_v2.h" -namespace singa{ -const float * cpu_uni_vec(const int n) -{ - float * res = new float[n]; - for(int i = 0; i < n; i++) - res[i] = 1.0; - return res; + +namespace singa { + +const float * cpu_uni_vec(const int n) { + float * res = new float[n]; + for (int i = 0; i < n; i++) + res[i] = 1.0; + return res; } -void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C) -{ - int lda, ldb; - CBLAS_TRANSPOSE tA, tB; - lda = TranA ? m : k; - ldb = TranB ? k : n; - tA = TranA ? CblasTrans : CblasNoTrans; - tB = TranB ? CblasTrans : CblasNoTrans; - cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, B, ldb, beta, C, n); +void cpu_gemm(const float * A, const float * B, const int m, const int n, +const int k, const float alpha, const float beta, +const bool TranA, const bool TranB, float * C) { + int lda, ldb; + CBLAS_TRANSPOSE tA, tB; + lda = TranA ? m : k; + ldb = TranB ? k : n; + tA = TranA ? CblasTrans : CblasNoTrans; + tB = TranB ? CblasTrans : CblasNoTrans; + cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, + B, ldb, beta, C, n); } -void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C) -{ - CBLAS_TRANSPOSE tA; - tA = TranA ? CblasTrans : CblasNoTrans; - cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1); +void cpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C) { + CBLAS_TRANSPOSE tA; + tA = TranA ? 
CblasTrans : CblasNoTrans; + cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1); } -void cpu_axpy(const float * A, const int n, const float alpha, float * B) -{ - cblas_saxpy(n, alpha, A, 1, B, 1); +void cpu_axpy(const float * A, const int n, const float alpha, float * B) { + cblas_saxpy(n, alpha, A, 1, B, 1); } -float cpu_dot(const float * A, const float * B, const int n) -{ - float sum = 0; - for(int i = 0 ; i < n ; i++) - sum += A[i]*B[i]; - return sum; +float cpu_dot(const float * A, const float * B, const int n) { + float sum = 0; + for (int i = 0 ; i < n ; i++) + sum += A[i] * B[i]; + return sum; } -//Trick: swap A and B +// Trick: swap A and B // -void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C) -{ +void gpu_gemm(const float * A, const float * B, const int m, const int n, +const int k, const float alpha, const float beta, const bool TranA, +const bool TranB, float * C) { int lda = TranA ? m : k; int ldb = TranB ? k : n; int ldc = n; - - cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T; - + cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasHandle_t handle; cublasCreate(&handle); - - cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc); - + cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, + A, lda, &beta, C, ldc); cublasDestroy(handle); } -void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C) -{ - int lda = n ; - cublasOperation_t tA= (TranA==true) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 1); - - cublasDestroy(handle); - +void gpu_gemv(const float * A, const float * B, const int m, const int n, +const float alpha, const float beta, const bool TranA, float * C) { + int lda = n; + cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasHandle_t handle; + cublasCreate(&handle); + cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1); + cublasDestroy(handle); } -void gpu_axpy(const float * A, const int n, const float alpha, float * B) -{ - - cublasHandle_t handle; - cublasCreate(&handle); - - cublasSaxpy(handle,n,&alpha,A,1,B,1); - - cublasDestroy(handle); - +void gpu_axpy(const float * A, const int n, const float alpha, float * B) { + cublasHandle_t handle; + cublasCreate(&handle); + cublasSaxpy(handle, n, &alpha, A, 1, B, 1); + cublasDestroy(handle); } -float gpu_dot(const float * A, const float * B, const int n) -{ - cublasHandle_t handle; - cublasCreate(&handle); - - float result=0.0; - - cublasSdot(handle,n,A,1,B,1,&result); - - cublasDestroy(handle); - - return result; - +float gpu_dot(const float * A, const float * B, const int n) { + cublasHandle_t handle; + cublasCreate(&handle); + float result = 0.0; + cublasSdot(handle, n, A, 1, B, 1, &result); + cublasDestroy(handle); + return result; } -} // namespace singa +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/01d91af1/src/blob/math_blob.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc index bd0e6ee..ad0b766 100644 --- a/src/blob/math_blob.cc +++ b/src/blob/math_blob.cc @@ -1,207 +1,198 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + #include "singa/blob/math_blob.h" #include "singa/blob/math_kernel.h" namespace singa { -/**********************************************************************************/ +/*****************************************************************************/ // shape_check function -int get_size(const std::vector<int>& shape) -{ - int sum = 1; - for(unsigned int i = 0; i < shape.size(); i++) sum *= shape[i]; - return sum; +int get_size(const std::vector<int>& shape) { + int sum = 1; + for (unsigned int i = 0; i < shape.size(); i++) sum *= shape[i]; + return sum; } -/**********************************************************************************/ +/*****************************************************************************/ // class1 matrix operation -void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta) -{ - if(check_shape_mmm(A, B, *C)) - { - int m = C->shape().at(0); - int n = C->shape().at(1); - int k = A.isTranspose() ? 
A.shape().at(0) : A.shape().at(1); - bool TranA = A.isTranspose(); - bool TranB = B.isTranspose(); - - if(xpu == cpu) - { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } +void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C, float alpha, float beta) { + if (check_shape_mmm(A, B, *C)) { + int m = C->shape().at(0); + int n = C->shape().at(1); + int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); + bool TranA = A.isTranspose(); + bool TranB = B.isTranspose(); + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, + TranA, TranB, C->mutable_gpu_data()); + } + } else { + // report errors here + } } -//C = alpha*A*B+beta*C, A, B and C are matrix +// C = alpha*A*B+beta*C, A, B and C are matrix - -void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - GEMM(xpu, A, B, C, 1, 0); +void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + GEMM(xpu, A, B, C, 1, 0); } // A,B and C are matrix -void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - if(check_shape_mvv(A, B, *C)) - { - int m = B.shape().at(0); - int n = C->shape().at(0); - bool TranA = A.isTranspose(); - - if(xpu == cpu) - { - cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } - +void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + if (check_shape_mvv(A, B, *C)) { + int m 
= B.shape().at(0); + int n = C->shape().at(0); + bool TranA = A.isTranspose(); + if (xpu == cpu) { + cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, + C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, + C->mutable_gpu_data()); + } + } else { + // report errors here + } } // A is matrix,B and C are vector - -void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) -{ - if(check_shape_vvm(A, B, *C)) - { - int m = C->shape().at(0); - int n = C->shape().at(1); - - if(xpu == cpu) - { - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, C->mutable_cpu_data()); - } - if(xpu == gpu) - { - //gpu part - gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data()); - } - } - else{ - // report errors here - } + +void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, +Blob<float> * C) { + if (check_shape_vvm(A, B, *C)) { + int m = C->shape().at(0); + int n = C->shape().at(1); + if (xpu == cpu) { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_cpu_data()); + } + if (xpu == gpu) { + // gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, + false, false, C->mutable_gpu_data()); + } + } else { + // report errors here + } } // C is matrix,A and B are vector - -float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) -{ - float res = 0; - if(check_shape_equal(A, B, B)) - { - int n = get_size(A.shape()); - if(xpu == cpu) - { - res = cpu_dot(A.cpu_data(), B.cpu_data(), n); - } - if(xpu == gpu) - { - //gpu part - res = gpu_dot(A.gpu_data(), B.gpu_data(), n); - } - } - else{ - // report errors here - } - return res; + +float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) { + float res = 0; + if (check_shape_equal(A, B, B)) { + int n = get_size(A.shape()); + if (xpu == cpu) { + res = cpu_dot(A.cpu_data(), B.cpu_data(), n); + } + if (xpu == gpu) { + // gpu part + res = 
gpu_dot(A.gpu_data(), B.gpu_data(), n); + } + } else { + // report errors here + } + return res; } -//A and B are vectors - -void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) -{ - if(check_shape_equal(A, *B, *B)) - { - - if(xpu == cpu) - { - cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, B->mutable_cpu_data()); - } - if(xpu == gpu) - { - gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, B->mutable_gpu_data()); - } - } - else{ - // report errors here - } +// A and B are vectors + +void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) { + if (check_shape_equal(A, *B, *B)) { + if (xpu == cpu) { + cpu_axpy(A.cpu_data(), get_size(A.shape()), + alpha, B->mutable_cpu_data()); + } + if (xpu == gpu) { + gpu_axpy(A.gpu_data(), get_size(A.shape()), + alpha, B->mutable_gpu_data()); + } + } else { + // report errors here + } } // element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size -inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) -{ - MVAdd(xpu, A, B, 1, 0); +inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) { + MVAdd(xpu, A, B, 1, 0); } // A is a vector, B is a matrix , let each row of B to be A -void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) -{ - if(check_shape_mv(*B, A)) - { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; - - if(xpu == cpu) - { - const float * univ = cpu_uni_vec(n); - cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, false, false, B->mutable_cpu_data()); - delete univ; - } - - if(xpu == gpu) - { - singa_gpu_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n); - //gpu part - } - } - else{ - // report errors here - } +void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta) { + if (check_shape_mv(*B, A)) { + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + if (xpu == cpu) { + const float * univ = cpu_uni_vec(n); + cpu_gemm(A.cpu_data(), univ, m, 
n, 1, alpha, beta, + false, false, B->mutable_cpu_data()); + delete univ; + } + if (xpu == gpu) { + singa_gpu_add_vec_row(B->gpu_data(), + A.gpu_data(), A.gpu_data(), m, n, n); + // gpu part + } + } else { + // report errors here + } } // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij // will use gemm. faster than general expand_f -void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) -{ - if(check_shape_mv(A, *B)) - { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; - - if(xpu == cpu) - { - const float * univ = cpu_uni_vec(n); - cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, false, false, B->mutable_cpu_data()); - delete univ; - } - if(xpu == gpu) - { - singa_gpu_sum_col(A.gpu_data(),B->gpu_data(),m,n,n); - //gpu part - } - } - else{ - // report errors here - } +void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, +float alpha, float beta) { + if (check_shape_mv(A, *B)) { + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + if (xpu == cpu) { + const float * univ = cpu_uni_vec(n); + cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, + false, false, B->mutable_cpu_data()); + delete univ; + } + if (xpu == gpu) { + singa_gpu_sum_col(A.gpu_data(), B->gpu_data(), m, n, n); + // gpu part + } + } else { + // report errors here + } } // B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi // will use gemm. faster than general reduce_f -} // namespace singa +} // namespace singa
