SINGA-182 Clean math function APIs and implementations. Implement GEMM/DGMM to support sum rows/columns, and add/sub/mult/div by row/column.
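As context for the API changes in include/singa/core/tensor.h below, here is a minimal usage sketch of the new matrix/vector helpers and the consolidated Mult(alpha, A, B, beta, C) call. It is illustrative only and not part of the commit; it assumes kFloat32 tensors on the default CppCPU device and a program linked against the built SINGA library.

#include "singa/core/tensor.h"
using namespace singa;

int main() {
  Tensor M(Shape{3, 2});        // 3x2 matrix, kFloat32 on the default device
  Tensor v(Shape{2});           // length-2 row vector
  M.SetValue(2.0f);
  v.SetValue(1.0f);

  AddRow(v, &M);                // M[r][c] += v[c] for every row r
  MultRow(v, &M);               // M[r][c] *= v[c], implemented via DGMM

  Tensor col(Shape{3});
  col.SetValue(0.5f);
  SubColumn(col, &M);           // M[r][c] -= col[r] for every column c

  Tensor rowsum(Shape{2});
  SumRows(M, &rowsum);          // sum all rows of M into a single row

  Tensor B(Shape{2, 4}), C(Shape{3, 4});
  B.SetValue(1.0f);
  Mult(1.0f, M, B, 0.0f, &C);   // GEMM: C = 1.0 * M * B + 0.0 * C
  return 0;
}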
Pass all test; Format code and update the consistency guide for cleaning code. Add the compile guard for USE_CBLAS. TODO, find cblas by cmake and set USE_CBLAS Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/870d1a97 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/870d1a97 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/870d1a97 Branch: refs/heads/master Commit: 870d1a97e19061f3f42b9cf907874609f7158231 Parents: fbd5219 Author: Wei Wang <[email protected]> Authored: Fri May 27 20:31:41 2016 +0800 Committer: Wei Wang <[email protected]> Committed: Mon May 30 13:24:51 2016 +0800 ---------------------------------------------------------------------- CMakeLists.txt | 1 + cmake/Cuda.cmake | 1 + cmake/Dependencies.cmake | 4 + cmake/Templates/singa_config.h.in | 4 +- include/singa/core/tensor.h | 257 +++++++------- include/singa/utils/cuda_utils.h | 60 ++-- src/core/device/cpp_cpu.cc | 13 +- src/core/device/cuda_gpu.cc | 10 +- src/core/device/device.cc | 1 - src/core/tensor/math_kernel.cu | 26 ++ src/core/tensor/math_kernel.h | 15 +- src/core/tensor/tensor.cc | 610 +++++++++++++++++++------------- src/core/tensor/tensor_math.h | 160 +++++---- src/core/tensor/tensor_math_cpp.h | 157 +++++--- src/core/tensor/tensor_math_cuda.h | 117 ++++-- test/singa/test_cpp_math.cc | 25 -- test/singa/test_mse.cc | 26 +- test/singa/test_tensor.cc | 2 - test/singa/test_tensor_math.cc | 447 ++++++++++++++++++++++- 19 files changed, 1325 insertions(+), 611 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index e08fb98..d585497 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}") INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR}) #OPTION(CPU_ONLY "use GPU libs" OFF) +OPTION(USE_CBLAS "Use CBlas libs" OFF) OPTION(USE_CUDA "Use Cuda libs" ON) OPTION(USE_CUDNN "Use Cudnn libs" ON) OPTION(USE_OPENCV "Use opencv" OFF) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Cuda.cmake ---------------------------------------------------------------------- diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 19d4e27..4985bb8 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -22,3 +22,4 @@ ENDIF() INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS}) LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Dependencies.cmake ---------------------------------------------------------------------- diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ae28073..e995553 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -14,3 +14,7 @@ ELSE() SET(USE_CUDA FALSE) SET(USE_CUDNN FALSE) ENDIF() + + +#LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so") +#MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Templates/singa_config.h.in ---------------------------------------------------------------------- diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in index 
e0f7328..5e8b32d 100644 --- a/cmake/Templates/singa_config.h.in +++ b/cmake/Templates/singa_config.h.in @@ -4,7 +4,9 @@ // Binaries director #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" -#cmakedefine CPU_ONLY +#cmakedefine CPU_ONLY + +#cmakedefine USE_CBLAS // cuda #cmakedefine USE_CUDA http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/core/tensor.h ---------------------------------------------------------------------- diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h index e560071..f51c899 100644 --- a/include/singa/core/tensor.h +++ b/include/singa/core/tensor.h @@ -33,22 +33,22 @@ namespace singa { typedef vector<size_t> Shape; typedef Shape::iterator ShapeIter; -inline size_t Product(const Shape& shape, int start = 0, size_t len = 0) { +inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) { if (len == 0) len = shape.size(); CHECK_LE(len, shape.size()); size_t v = 1; - for (unsigned int i = start; i < len; i ++) + for (unsigned int i = start; i < len; i++) v *= shape[i]; return v; } /// hardcode the width of types defined in DataType const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int), - sizeof(char), sizeof(double)}; + sizeof(char), sizeof(double)}; inline size_t SizeOf(DataType t) { static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t), - "Num of data types not match num of data width"); + "Num of data types not match num of data width"); CHECK_GT(kNumDataType, t); return kDataWidth[t]; } @@ -62,52 +62,44 @@ inline size_t SizeOf(DataType t) { /// then it must be set up correctly (shape, device). Otherwise, runtime error /// like SegmentFault would happen. Simply type/device check would be conducted. class Tensor { - public: +public: ~Tensor(); Tensor(); - explicit Tensor(Shape&& shape, DataType dtype = kFloat32); - explicit Tensor(const Shape& shape, DataType dtype = kFloat32); - Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32); - Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32); + explicit Tensor(Shape &&shape, DataType dtype = kFloat32); + explicit Tensor(const Shape &shape, DataType dtype = kFloat32); + Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32); + Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32); /// Copy Tensor to share the internal data. No deep copy. - Tensor(const Tensor& from); + Tensor(const Tensor &from); /// Copy Tensor to share the internal data. No deep copy. - Tensor(Tensor&& from); + Tensor(Tensor &&from); /// For functions in xx_math.cc to access the blob. /// Users should not operate against Blob directly. /// blob_ is allocated in constructors. - Blob* blob() const { - return blob_; - } + Blob *blob() const { return blob_; } - Device* device() const { - return device_; - } + Device *device() const { return device_; } /// Return immutable Tensor values with given type. 
- template <typename DType> - DType data() const { - return static_cast<DType> (blob()->data()); + template <typename DType> DType data() const { + return static_cast<DType>(blob()->data()); } /// data type, including kFloat16, kFloat32, kInt - const DataType data_type() const { - return data_type_; - } + const DataType data_type() const { return data_type_; } - const Shape& shape() const { - return shape_; - } + const Shape &shape() const { return shape_; } - int nDim() const { - return shape_.size(); + const size_t shape(size_t idx) const { + CHECK_LT(idx, shape_.size()); + return shape_.at(idx); } - bool transpose() const { - return transpose_; - } + int nDim() const { return shape_.size(); } + + bool transpose() const { return transpose_; } /// Return number of total elements size_t Size() const { @@ -116,39 +108,37 @@ class Tensor { } /// Return memory size (i.e., Bytes) - size_t MemSize() const { - return blob_->size(); - } + size_t MemSize() const { return blob_->size(); } /// Reset the tensor shape, it may reallocate blob, if MemSize() changes. - void Reshape(const Shape& shape); + void Reshape(const Shape &shape); + void Reshape(Shape &&shape); /// Reset the shape, device, and data type as given tensor. /// If blob size changes, then reallocate a new blob. The previous blob would /// be deleted. - void ResetLike(const Tensor& t); + void ResetLike(const Tensor &t); /// Reset the data type, it would reallocate blob if type changes. void AsType(DataType type); /// Reset the device. /// If the target device is a diff device, then do deep data copy. - void ToDevice(Device* dev); + void ToDevice(Device *dev); /// Equivalent to ToDevice(host_dev). void ToHost(); /// Set each element of the tensor to be x - template<typename SType> - void SetValue(SType x); + template <typename SType> void SetValue(const SType x); /// For init the tensor values, copy 'num' elements. - template<typename DType> - void CopyDataFromHostPtr(const DType* src, size_t num); + template <typename DType> + void CopyDataFromHostPtr(const DType *src, size_t num); /// Copy data from another Tensor which may be on a diff device. /// Meta data would not be copied! - void CopyData(const Tensor& other); + void CopyData(const Tensor &other); /// Return an exactly the same Tensor with data been deep copied. Tensor Clone(); @@ -160,135 +150,124 @@ class Tensor { Tensor T() const; /// Copy the meta info with data blob shared. - Tensor& operator=(const Tensor& t); + Tensor &operator=(const Tensor &t); /// Copy the meta info with data blob shared. - Tensor& operator=(Tensor&& t); + Tensor &operator=(Tensor &&t); - - Tensor& operator+=(const Tensor& t); + Tensor &operator+=(const Tensor &t); // void operator+=(Tensor&& t); - Tensor& operator-=(const Tensor& t); + Tensor &operator-=(const Tensor &t); // void operator-=(Tensor&& t); - Tensor& operator*=(const Tensor& t); + Tensor &operator*=(const Tensor &t); // void operator*=(Tensor&& t); - Tensor& operator/=(const Tensor& t); + Tensor &operator/=(const Tensor &t); // void operator/=(Tensor&& t); // Scalar operations. 
/// T is a scalar type - template<typename DType> - Tensor& operator+=(DType x); + template <typename DType> Tensor &operator+=(DType x); /// T is a scalar type - template <typename DType> - Tensor& operator-=(const DType x); + template <typename DType> Tensor &operator-=(const DType x); /// T is a scalar type - template <typename DType> - Tensor& operator*=(const DType x); + template <typename DType> Tensor &operator*=(const DType x); /// T is a scalar type - template <typename DType> - Tensor& operator/=(const DType x); + template <typename DType> Tensor &operator/=(const DType x); /// save Tensor into a proto msg // void ToProto(TensorProto* t); /// load Tensor from proto msg // void FromProto(const TensorProto& t); - protected: +protected: bool transpose_ = false; DataType data_type_ = kFloat32; - Device* device_ = nullptr; + Device *device_ = nullptr; /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free. /// If you want to get an allocated Blob, use blob() instead of blob_. - Blob* blob_ = nullptr; + Blob *blob_ = nullptr; Shape shape_; }; +inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) { + CHECK_EQ(in1.data_type(), in2.data_type()); + CHECK_EQ(in1.device()->lang(), in2.device()->lang()); +} + +Tensor Reshape(const Tensor &in, const Shape &s); +Tensor Reshape(const Tensor &in, Shape &&s); + // For tensors with sparse content, e.g., missing columns or rows. // class SparseTensor : public Tensor {}; /// Copy 'num' elements of src to dst. /// The first 'src_offset' ('dst_offset') elements will be skipped. -void CopyDataToFrom(Tensor* dst, - const Tensor& src, - size_t num, - size_t src_offset = 0, - size_t dst_offset = 0); +void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, + size_t src_offset = 0, size_t dst_offset = 0); // ==================Simple Linear Algebra Operations========================= -Tensor Abs(const Tensor& t); -Tensor Exp(const Tensor& t); -Tensor Log(const Tensor& t); -Tensor ReLU(const Tensor& t); -Tensor Sigmoid(const Tensor& t); -Tensor Sign(const Tensor& t); -Tensor Sqrt(const Tensor& t); -Tensor Square(const Tensor& t); -Tensor Tanh(const Tensor& t); - - -template<typename SType> -SType Sum(const Tensor& t); +Tensor Abs(const Tensor &t); +Tensor Exp(const Tensor &t); +Tensor Log(const Tensor &t); +Tensor ReLU(const Tensor &t); +Tensor Sigmoid(const Tensor &t); +Tensor Sign(const Tensor &t); +Tensor Sqrt(const Tensor &t); +Tensor Square(const Tensor &t); +Tensor Tanh(const Tensor &t); + +template <typename SType> SType Sum(const Tensor &t); /// Sum elements in the Tensor, currently only support vector and matrix. /// if 'axis' is 0, sum all rows into a single row /// if 'axis' is 1, sum all columns into a single column /// TODO(wangwei) support arbitrary Tensor like numpy.sum -Tensor Sum(const Tensor& t, int axis); +Tensor Sum(const Tensor &t, int axis); /// Average elements in the Tensor, currently only support vector and matrix. /// if 'axis' is 0, average all rows into a single row /// if 'axis' is 1, average all columns into a single column /// TODO(wangwei) support arbitrary Tensor like numpy.average -Tensor Average(const Tensor&t, int axis); +Tensor Average(const Tensor &t, int axis); /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows, /// and shape_[axis+1]*...*shape_[nDim()] columns. /// and do softmax along each row. 
-Tensor Softmax(const Tensor& t, int axis = -1); -void Softmax(const Tensor& t, Tensor* ret, int axis = -1); +Tensor Softmax(const Tensor &t, int axis = -1); +void Softmax(const Tensor &t, Tensor *ret, int axis = -1); /// Element-wise opeartion, ret[i]=t[i]^x -template<typename DType> -Tensor Pow(const Tensor& t, DType x); +template <typename DType> Tensor Pow(const Tensor &t, DType x); /// Element-wise opeartion, ret[i]=t[i]^x -template<typename DType> -void Pow(const Tensor& t, DType x, Tensor* ret); +template <typename DType> void Pow(const Tensor &t, DType x, Tensor *ret); /// Element-wise opeartion, ret[i]=baes[i]^exp[i] -Tensor Pow(const Tensor& base, Tensor exp); +Tensor Pow(const Tensor &base, Tensor exp); /// Element-wise opeartion, ret[i]=baes[i]^exp[i] -void Pow(const Tensor& base, const Tensor& exp, Tensor* ret); +void Pow(const Tensor &base, const Tensor &exp, Tensor *ret); -Tensor operator+(const Tensor& lhs, const Tensor& rhs); -void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator-(const Tensor& lhs, const Tensor& rhs); -void Sub(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator*(const Tensor& lhs, const Tensor& rhs); -void EltwiseMult(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator/(const Tensor& lhs, const Tensor& rhs); -void Div(const Tensor& lhs, const Tensor& rhs, Tensor* ret); +Tensor operator+(const Tensor &lhs, const Tensor &rhs); +void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator-(const Tensor &lhs, const Tensor &rhs); +void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator*(const Tensor &lhs, const Tensor &rhs); +void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator/(const Tensor &lhs, const Tensor &rhs); +void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret); -template <typename DType> -Tensor operator+(const Tensor& t, DType x); -template <typename DType> -void Add(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator+(const Tensor &t, DType x); +template <typename DType> void Add(const Tensor &t, DType x, Tensor *ret); -template <typename DType> -Tensor operator-(const Tensor& t, DType x); -template <typename DType> -void Sub(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator-(const Tensor &t, DType x); +template <typename DType> void Sub(const Tensor &t, DType x, Tensor *ret); +template <typename DType> Tensor operator*(const Tensor &t, DType x); template <typename DType> -Tensor operator*(const Tensor& t, DType x); -template <typename DType> -void EltwiseMult(const Tensor& t, DType x, Tensor* ret); +void EltwiseMult(const Tensor &t, DType x, Tensor *ret); -template <typename DType> -Tensor operator/(const Tensor& t, DType x); -template <typename DType> -void Div(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator/(const Tensor &t, DType x); +template <typename DType> void Div(const Tensor &t, DType x, Tensor *ret); // ================Blas operations============================================ // We fix the scalar argument type to be float. @@ -302,27 +281,59 @@ void Div(const Tensor& t, DType x, Tensor* ret); // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = lhs * rhs -Tensor Mult(const Tensor& lhs, const Tensor& rhs); +/// on the Tensor shape. 
result = A * B +Tensor Mult(const Tensor &A, const Tensor &B); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = lhs * rhs -void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret); +/// on the Tensor shape. C = A * B +void Mult(const Tensor &A, const Tensor &B, Tensor *C); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret -Tensor Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs); -/// Do matrix vector multipication or matrix matrix multiplication depdending /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret -void Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs, - Tensor* C); +void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs, + const float beta, Tensor *C); // ================Random operations========================================== /// For each element x set x = 1 if random() < p; otherwise x = 1. -void Bernoulli(float p, Tensor* t); +void Bernoulli(float p, Tensor *t); /// Fill in Tensor 't' following uniform distribution. -void Uniform(float low, float high, Tensor* t); +void Uniform(float low, float high, Tensor *t); /// Fill in Tensor 't' following Gaussian distribution. -void Gaussian(float mean, float std, Tensor* t); +void Gaussian(float mean, float std, Tensor *t); + +// follow the consistency guide +// ============Matrix vector operations======================================= +/// Add column 'v' with each column of matrix M +void AddColumn(const Tensor &v, Tensor *M); +void AddColumn(const float alpha, const float beta, const Tensor &v, + Tensor *out); +/// Sub column 'v' by each column of matrix M +void SubColumn(const Tensor &v, Tensor *M); +/// Multiply column 'v' and each column of matrix M; write results into 'out' +void MultColumn(const Tensor &v, Tensor *M); +/// Divide column 'v' by each column of matrix M; write results into 'out' +void DivColumn(const Tensor &v, Tensor *M); + +/// Add row 'v' with each row of matrix M; write results into 'out' +void AddRow(const Tensor &v, Tensor *out); +void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M); +/// Sub row 'v' by each row of matrix M; write results into 'out' +void SubRow(const Tensor &v, Tensor *M); +/// Multiply row 'v' with each row of matrix M; write results into 'out' +void MultRow(const Tensor &v, Tensor *M); +/// Divide row 'v' by each row of matrix M; write results into 'out' +void DivRow(const Tensor &v, Tensor *M); + +/// Sum all rows of matrix M into a single row as 'out' +void SumRows(const Tensor &M, Tensor *out); +/// Sum all columns of matrix M into a single column as 'out' +void SumColumns(const Tensor &M, Tensor *out); + +/// For each element x of Tensor 'in', compute alpha/x +template <typename SType> Tensor Div(const SType alpha, const Tensor &in); + +/// For each element x of Tensor 'in', compute alpha/x into 'out' +template <typename SType> +void Div(const SType alpha, const Tensor &in, Tensor *out); } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/utils/cuda_utils.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h index 076d0d1..17eb683 100644 --- a/include/singa/utils/cuda_utils.h +++ b/include/singa/utils/cuda_utils.h @@ -8,33 +8,7 @@ #include <cuda.h> #include <cuda_runtime.h> -// -// CUDA macros -// - 
-// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << curandGetErrorString(status); \ - } while (0) - -const char* cublasGetErrorString(cublasStatus_t error) { +inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -64,7 +38,7 @@ const char* cublasGetErrorString(cublasStatus_t error) { return "Unknown cublas status"; } -const char* curandGetErrorString(curandStatus_t error) { +inline const char* curandGetErrorString(curandStatus_t error) { switch (error) { case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; @@ -95,5 +69,33 @@ const char* curandGetErrorString(curandStatus_t error) { } return "Unknown curand status"; } -#endif + +// +// CUDA macros +// + +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ + << cublasGetErrorString(status); \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ + << curandGetErrorString(status); \ + } while (0) + + +#endif // USE_CUDA #endif // SINGA_UTILS_CUDA_UTILS_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cpp_cpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc index 28b0da4..44f614a 100644 --- a/src/core/device/cpp_cpu.cc +++ b/src/core/device/cpp_cpu.cc @@ -33,13 +33,18 @@ void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) { } void* CppCPU::Malloc(int size) { - void *ptr = malloc(size); - memset(ptr, 0, size); - return ptr; + if (size > 0) { + void *ptr = malloc(size); + memset(ptr, 0, size); + return ptr; + } else { + return nullptr; + } } void CppCPU::Free(void* ptr) { - free(ptr); + if (ptr != nullptr) + free(ptr); } void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cuda_gpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index 0ba05fb..5d4e1ed 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -89,15 +89,17 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes, /// Allocate cpu memory. void* CudaGPU::Malloc(int size) { void* ptr = nullptr; - CUDA_CHECK(cudaMalloc(&ptr, size)); - CUDA_CHECK(cudaMemset(ptr, 0, size)); + if (size > 0) { + CUDA_CHECK(cudaMalloc(&ptr, size)); + CUDA_CHECK(cudaMemset(ptr, 0, size)); + } return ptr; } /// Free cpu memory. 
void CudaGPU::Free(void* ptr) { - CHECK_NE(ptr, nullptr); - CUDA_CHECK(cudaFree(ptr)); + if (ptr != nullptr) + CUDA_CHECK(cudaFree(ptr)); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/device.cc b/src/core/device/device.cc index ede3fda..1d3c446 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -35,7 +35,6 @@ void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs, Blob* Device::NewBlob(int size) { if (size > 0) { void* ptr = Malloc(size); - // memset(ptr, 0, size); return new Blob(ptr, size); } else { return nullptr; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu index e67ea7b..88041b1 100644 --- a/src/core/tensor/math_kernel.cu +++ b/src/core/tensor/math_kernel.cu @@ -450,6 +450,32 @@ void set_value(int n, float v, float *out) { void threshold(int n, float alpha, const float *in, float *out) { kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n); } + + +// follow the consistency guide for math API +__global__ void KernelDiv(const size_t num, const float alpha, const float *in, + float *out) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; + idx += blockDim.x * gridDim.x) { + out[idx] = alpha / in[idx]; + } +} + +__global__ void KernelSet(const size_t num, const float x, float *out) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; + idx += blockDim.x * gridDim.x) { + out[idx] = x; + } +} + +void Div(const size_t num, float alpha, const float *in, float *out, + cudaStream_t s) { + KernelDiv<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, alpha, in, out); +} + +void Set(const size_t num, const float x, float *out, cudaStream_t s) { + KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out); +} } // namespace cuda } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h index b016007..925346e 100644 --- a/src/core/tensor/math_kernel.h +++ b/src/core/tensor/math_kernel.h @@ -28,13 +28,7 @@ /// TODO(wangwei) Clean the function APIs as commented in tensor_math.h /// Add 'Context *ctx' as an argument of all cuda functions. namespace singa { -/* - void softmaxloss_forward(int n, int dim, const float *prob, - const int *label, float *loss); - void softmaxloss_backward(int n, int dim, float scale, - const int *label, float *grad); -*/ // TODO(wangwei) make all function templates. 
namespace cuda { void sum(int n, const float *in, float *out); @@ -44,7 +38,7 @@ void sum_row(int rows, int cols, int stride, const float *in, float *out); void sum_col(int rows, int cols, int stride, const float *in, float *out); void add_row(int rows, int cols, int stride, const float *in_row, - const float *in_mat, float *out); + const float *in_mat, float *out); void add(int n, const float *a, const float *b, float *out); @@ -87,7 +81,12 @@ void div(int n, const float *a, const float *b, float *out); void set_value(int n, float v, float *out); void threshold(int n, float alpha, const float *in, float *out); -} // cuda + +// follow the consistency guide for math API +void Div(const size_t num, const float x, const float *in, float *out, + cudaStream_t s); +void Set(const size_t num, const float x, float *out, cudaStream_t s); +} // cuda } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 052f3ff..0e47a4f 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -25,51 +25,51 @@ namespace singa { Tensor::~Tensor() { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + // LOG(ERROR) << "~"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = nullptr; } Tensor::Tensor() { device_ = &defaultDevice; } -Tensor::Tensor(const Shape& shape, DataType dtype) +Tensor::Tensor(const Shape &shape, DataType dtype) : data_type_(dtype), device_(&defaultDevice), shape_(shape) { device_ = &defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(Shape&& shape, DataType dtype) +Tensor::Tensor(Shape &&shape, DataType dtype) : data_type_(dtype), device_(&defaultDevice), shape_(shape) { device_ = &defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(const Shape& shape, Device* device, DataType dtype) +Tensor::Tensor(const Shape &shape, Device *device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(Shape&& shape, Device* device, DataType dtype) +Tensor::Tensor(Shape &&shape, Device *device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(const Tensor& t) - : transpose_(t.transpose_), - data_type_(t.data_type_), - device_(t.device_), - blob_(t.blob()), - shape_(t.shape_) { +Tensor::Tensor(const Tensor &t) + : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_), + blob_(t.blob()), shape_(t.shape_) { blob_->IncRefCount(); + // LOG(ERROR) << "const&"; } -Tensor::Tensor(Tensor&& t) - : transpose_(t.transpose_), - data_type_(t.data_type_), - device_(t.device_), +Tensor::Tensor(Tensor &&t) + : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_), shape_(std::move(t.shape_)) { blob_ = t.blob_; t.blob_ = nullptr; + // LOG(ERROR) << "&&"; } -void Tensor::ResetLike(const Tensor& t) { +void Tensor::ResetLike(const Tensor &t) { if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); shape_ = t.shape_; device_ = t.device_; data_type_ = 
t.data_type_; @@ -77,28 +77,40 @@ void Tensor::ResetLike(const Tensor& t) { } } -void Tensor::Reshape(const Shape& shape) { - if (shape_ != shape) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +void Tensor::Reshape(const Shape &shape) { + if (Product(shape_) != Product(shape)) { + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_)); - shape_ = shape; } + shape_ = shape; +} + +void Tensor::Reshape(Shape &&shape) { + if (Product(shape_) != Product(shape)) { + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); + blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_)); + } + shape_ = std::move(shape); } void Tensor::AsType(DataType type) { if (data_type_ != type) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = device_->NewBlob(Product(shape_) * SizeOf(type)); data_type_ = type; } } -void Tensor::ToDevice(Device* dst) { +void Tensor::ToDevice(Device *dst) { // TODO(wangwei) the comparison is very strict. May compare against device ID? if (device_ != dst) { Tensor tmp(shape_, dst, data_type_); tmp.CopyData(*this); - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = tmp.blob_; tmp.blob_ = nullptr; device_ = dst; @@ -108,7 +120,7 @@ void Tensor::ToDevice(Device* dst) { void Tensor::ToHost() { ToDevice(device_->host()); } template <typename DType> -void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) { +void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) { CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is " << DataType_Name(data_type_) << " user given type is of size " << sizeof(DType); @@ -118,10 +130,10 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) { LOG(WARNING) << "Copy data from null host ptr"; } } -template void Tensor::CopyDataFromHostPtr(const float* src, size_t num); -template void Tensor::CopyDataFromHostPtr(const int* src, size_t num); +template void Tensor::CopyDataFromHostPtr(const float *src, size_t num); +template void Tensor::CopyDataFromHostPtr(const int *src, size_t num); -void Tensor::CopyData(const Tensor& src) { +void Tensor::CopyData(const Tensor &src) { CHECK_EQ(Size(), src.Size()); CHECK(blob_ != nullptr); // Do copy only if the src's blob is already initialized. 
@@ -139,14 +151,21 @@ Tensor Tensor::Clone() { Tensor Tensor::T() const { CHECK_EQ(shape_.size(), 2u); - Tensor t(*this); + Tensor t; + t.device_ = device_; + t.data_type_ = data_type_; t.transpose_ = ~transpose_; - std::swap(t.shape_[0], t.shape_[1]); + t.shape_.push_back(shape_[1]); + t.shape_.push_back(shape_[0]); + t.blob_ = blob_; + blob_->IncRefCount(); return t; } -Tensor& Tensor::operator=(const Tensor& t) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +Tensor &Tensor::operator=(const Tensor &t) { + // LOG(ERROR) << "= const &"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); transpose_ = t.transpose_; data_type_ = t.data_type_; shape_ = t.shape_; @@ -156,8 +175,10 @@ Tensor& Tensor::operator=(const Tensor& t) { return *this; } -Tensor& Tensor::operator=(Tensor&& t) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +Tensor &Tensor::operator=(Tensor &&t) { + // LOG(ERROR) << "= &&"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); transpose_ = t.transpose_; data_type_ = t.data_type_; shape_ = std::move(t.shape_); @@ -167,10 +188,22 @@ Tensor& Tensor::operator=(Tensor&& t) { return *this; } -#define GenUnaryTensorArgMemberFunction(op, fn) \ - Tensor& Tensor::op(const Tensor& t) { \ - fn(*this, t, this); \ - return *this; \ +Tensor Reshape(const Tensor &in, const Shape &s) { + Tensor out(in); + out.Reshape(s); + return out; +} + +Tensor Reshape(const Tensor &in, Shape &&s) { + Tensor out(in); + out.Reshape(std::move(s)); + return out; +} + +#define GenUnaryTensorArgMemberFunction(op, fn) \ + Tensor &Tensor::op(const Tensor &t) { \ + fn(*this, t, this); \ + return *this; \ } GenUnaryTensorArgMemberFunction(operator+=, Add); @@ -178,13 +211,12 @@ GenUnaryTensorArgMemberFunction(operator-=, Sub); GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult); GenUnaryTensorArgMemberFunction(operator/=, Div); -#define GenUnaryScalarArgMemberFunction(op, fn) \ - template <typename DType> \ - Tensor& Tensor::op(DType x) { \ - fn(*this, x, this); \ - return *this; \ - } \ - template Tensor& Tensor::op<float>(float x) +#define GenUnaryScalarArgMemberFunction(op, fn) \ + template <typename DType> Tensor &Tensor::op(DType x) { \ + fn(*this, x, this); \ + return *this; \ + } \ + template Tensor &Tensor::op<float>(float x) GenUnaryScalarArgMemberFunction(operator-=, Sub); GenUnaryScalarArgMemberFunction(operator+=, Add); @@ -192,7 +224,7 @@ GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult); GenUnaryScalarArgMemberFunction(operator/=, Div); // ====================Tensor Operations======================================= -void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num, +void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, size_t dst_offset, size_t src_offset) { auto width = SizeOf(src.data_type()); CHECK_EQ(width, SizeOf(dst->data_type())); @@ -223,94 +255,93 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num, //============================================================================ /// typedef DType accroding to type value. /// DType would be used in the code block __VA_ARGS__. -#define TYPE_SWITCH(type, DType, ...) 
\ - do { \ - switch (type) { \ - case kFloat32: { \ - typedef float DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - case kInt: { \ - typedef int DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - case kChar: { \ - typedef char DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - default: \ - LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \ - } \ +#define TYPE_SWITCH(type, DType, ...) \ + do { \ + switch (type) { \ + case kFloat32: { \ + typedef float DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + case kInt: { \ + typedef int DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + case kChar: { \ + typedef char DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + default: \ + LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \ + } \ } while (0) /// typedef DType and Lang according to data type and device programming /// language respectively. /// type is from DataType, and lang is from LangType. /// DType and Lang would be used in __VA_ARGS__. -#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \ - do { \ - const int _SwitchShift = 3; \ - int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \ - switch (_SwitchHash) { \ - case ((kFloat32 << _SwitchShift) + kCuda): { \ - typedef float DType; \ - typedef lang::Cuda Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - case ((kFloat32 << _SwitchShift) + kCpp): { \ - typedef float DType; \ - typedef lang::Cpp Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - case ((kFloat32 << _SwitchShift) + kOpencl): { \ - typedef float DType; \ - typedef lang::Opencl Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - default: \ - LOG(FATAL) << "Unknown combination of data type " \ - << DataType_Name(dtype) << " and language " \ - << LangType_Name(ltype); \ - } \ +#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \ + do { \ + const int _SwitchShift = 3; \ + int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \ + switch (_SwitchHash) { \ + case ((kFloat32 << _SwitchShift) + kCuda): { \ + typedef float DType; \ + typedef lang::Cuda Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + case ((kFloat32 << _SwitchShift) + kCpp): { \ + typedef float DType; \ + typedef lang::Cpp Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + case ((kFloat32 << _SwitchShift) + kOpencl): { \ + typedef float DType; \ + typedef lang::Opencl Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + default: \ + LOG(FATAL) << "Unknown combination of data type " \ + << DataType_Name(dtype) << " and language " \ + << LangType_Name(ltype); \ + } \ } while (0) - -template <typename SType> -void Tensor::SetValue(SType x) { +template <typename SType> void Tensor::SetValue(const SType x) { CHECK_EQ(sizeof(SType), SizeOf(data_type_)); auto size = Size(); auto ptr = blob_; TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { + // cast x to DType device_->Exec( - [size, x, ptr](Context* ctx) { Set<DType, Lang>(size, x, ptr, ctx); }, + [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); }, {}, {ptr}); }); } - - -#define EltwiseUnaryTensorFn(fn, t, ret) \ - do { \ - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ - ret->device()->Exec( \ - [t, ret](Context* ctx) { \ - fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \ - }, \ - {t.blob()}, {ret->blob()}); \ - }); \ +template void Tensor::SetValue<float>(const float x); + +#define EltwiseUnaryTensorFn(fn, t, ret) \ + do { \ + TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ + ret->device()->Exec( \ + [t, ret](Context *ctx) { \ + fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \ + }, \ + 
{t.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenUnaryTensorFunction(fn) \ - Tensor fn(const Tensor& t) { \ - Tensor ret(t.shape(), t.device(), t.data_type()); \ - auto* retptr = &ret; \ - EltwiseUnaryTensorFn(fn, t, retptr); \ - return ret; \ +#define GenUnaryTensorFunction(fn) \ + Tensor fn(const Tensor &t) { \ + Tensor ret(t.shape(), t.device(), t.data_type()); \ + auto *retptr = &ret; \ + EltwiseUnaryTensorFn(fn, t, retptr); \ + return ret; \ } GenUnaryTensorFunction(Abs); @@ -323,63 +354,33 @@ GenUnaryTensorFunction(Sqrt); GenUnaryTensorFunction(Square); GenUnaryTensorFunction(Tanh); -// TODO(wangwei) consider matrix transpose. -Tensor SumRows(const Tensor& t) { - int ndim = t.shape().size(); - CHECK_EQ(ndim, 2) << "Cannot do SumRows for Tensor with ndim = " << ndim; - size_t nrow = t.shape().at(0), ncol = t.shape().at(1); - Tensor ret(Shape{ncol}, t.device(), t.data_type()); - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - ret.device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { - SumRows<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx); - }, - {t.blob()}, {ret.blob()}); - }); - return ret; -} - -// TODO(wangwei) consider matrix transpose. -Tensor SumColumns(const Tensor& t) { - int ndim = t.shape().size(); - CHECK_EQ(ndim, 2) << "Cannot do SumColumns for Tensor with ndim = " << ndim; - CHECK(!t.transpose()); // TODO(wangwei) enable transpose - size_t nrow = t.shape().at(0), ncol = t.shape().at(1); - Tensor ret(Shape{nrow}, t.device(), t.data_type()); - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - ret.device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { - SumColumns<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx); - }, - {t.blob()}, {ret.blob()}); - }); - return ret; -} - // TODO(wangwei) conside async exec -template<> -float Sum<float>(const Tensor& t) { +template <> float Sum<float>(const Tensor &t) { float s = 0.0f; TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - t.device()->Exec( - [t, &s](Context* ctx) { - Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx); + t.device()->Exec( + [t, &s](Context *ctx) { + Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx); }, {t.blob()}, {}); - }); + }); return s; } -Tensor Sum(const Tensor& t, int axis) { +Tensor Sum(const Tensor &M, int axis) { if (axis == 0) { - return SumRows(t); + Tensor out(Shape{M.shape(1)}, M.device(), M.data_type()); + SumRows(M, &out); + return out; } else { CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis; - return SumColumns(t); + Tensor out(Shape{M.shape(0)}, M.device(), M.data_type()); + SumColumns(M, &out); + return out; } } -Tensor Average(const Tensor& t, int axis) { +Tensor Average(const Tensor &t, int axis) { // operator/ only has implementation for float scalar type, hence it is // necessary to cast the denominator to a float. 
// TODO(wangwei) implement function for cast scalar type involved in Tensor @@ -401,13 +402,13 @@ Tensor Average(const Tensor& t, int axis) { } } -Tensor Softmax(const Tensor& t, int axis) { +Tensor Softmax(const Tensor &t, int axis) { Tensor ret(t.shape(), t.device(), t.data_type()); Softmax(t, &ret, axis); return ret; } -void Softmax(const Tensor& t, Tensor* ret, int axis) { +void Softmax(const Tensor &t, Tensor *ret, int axis) { int nrow = 1, ncol = t.Size(), size = ncol; CHECK_GE(axis, -1); CHECK_GT(t.shape().size(), 0u); @@ -418,34 +419,34 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) { } TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { ret->device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { + [nrow, ncol, t, ret](Context *ctx) { Softmax<DType, Lang>(nrow, ncol, t.blob(), ret->blob(), ctx); }, {t.blob()}, {ret->blob()}); }); } -#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \ - do { \ - TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \ - CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \ - ret->device()->Exec( \ - [lhs, rhs, ret](Context* ctx) { \ - fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \ - ctx); \ - }, \ - {lhs.blob(), rhs.blob()}, {ret->blob()}); \ - }); \ +#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \ + do { \ + TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \ + CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \ + ret->device()->Exec( \ + [lhs, rhs, ret](Context *ctx) { \ + fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \ + ctx); \ + }, \ + {lhs.blob(), rhs.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenBinaryTensorFunction(op, fn) \ - Tensor op(const Tensor& lhs, const Tensor& rhs) { \ - Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \ - fn(lhs, rhs, &ret); \ - return ret; \ - } \ - void fn(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { \ - EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \ +#define GenBinaryTensorFunction(op, fn) \ + Tensor op(const Tensor &lhs, const Tensor &rhs) { \ + Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \ + fn(lhs, rhs, &ret); \ + return ret; \ + } \ + void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \ + EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \ } GenBinaryTensorFunction(operator+, Add); @@ -454,32 +455,30 @@ GenBinaryTensorFunction(operator*, EltwiseMult); GenBinaryTensorFunction(operator/, Div); GenBinaryTensorFunction(Pow, Pow); -#define EltwiseTensorScalarFn(fn, t, x, ret) \ - do { \ - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ - static_assert(std::is_same<SType, DType>::value, \ - "The Scalar type must match the Tensor data type"); \ - ret->device()->Exec( \ - [t, x, ret](Context* ctx) { \ - fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \ - }, \ - {t.blob()}, {ret->blob()}); \ - }); \ +#define EltwiseTensorScalarFn(fn, t, x, ret) \ + do { \ + TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ + static_assert(std::is_same<SType, DType>::value, \ + "The Scalar type must match the Tensor data type"); \ + ret->device()->Exec( \ + [t, x, ret](Context *ctx) { \ + fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \ + }, \ + {t.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenTensorScalarFunction(op, fn) \ - template <typename SType> \ - Tensor op(const Tensor& t, SType x) { \ - Tensor ret(t.shape(), t.device(), t.data_type()); \ - fn(t, x, &ret); \ - return ret; \ - } \ - template <typename SType> \ - void 
fn(const Tensor& t, SType x, Tensor* ret) { \ - EltwiseTensorScalarFn(fn, t, x, ret); \ - } \ - template Tensor op<float>(const Tensor& t, float x); \ - template void fn<float>(const Tensor& t, const float x, Tensor* ret) +#define GenTensorScalarFunction(op, fn) \ + template <typename SType> Tensor op(const Tensor &t, SType x) { \ + Tensor ret(t.shape(), t.device(), t.data_type()); \ + fn(t, x, &ret); \ + return ret; \ + } \ + template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) { \ + EltwiseTensorScalarFn(fn, t, x, ret); \ + } \ + template Tensor op<float>(const Tensor &t, float x); \ + template void fn<float>(const Tensor &t, const float x, Tensor *ret) GenTensorScalarFunction(operator+, Add); GenTensorScalarFunction(operator-, Sub); @@ -488,83 +487,216 @@ GenTensorScalarFunction(operator/, Div); GenTensorScalarFunction(Pow, Pow); // ================Blas operations============================================ -Tensor Mult(const Tensor& lhs, const Tensor& rhs) { - Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); +Tensor Mult(const Tensor &lhs, const Tensor &rhs) { + Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type()); Mult(lhs, rhs, &ret); return ret; } -void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { - Mult(1, lhs, 1, rhs, ret); +void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { + Mult(1.0f, lhs, rhs, 0.0f, ret); } -Tensor Mult(float alpha, const Tensor& A, float beta, const Tensor& B) { - Tensor ret(A.shape(), A.device(), A.data_type()); - Mult(alpha, A, beta, B, &ret); - return ret; -} - -void Mult(float alpha, const Tensor& A, float beta, const Tensor& B, - Tensor* C) { +void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta, + Tensor *C) { CHECK_EQ(A.shape().size(), 2u); - bool transA = A.transpose(); - size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0; - if (B.shape().size() == 1u) { - n = C->Size(); + if (B.nDim() == 1u) { TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { C->device()->Exec( - [transA, m, n, alpha, A, beta, B, C](Context* ctx) { - GEMV<DType, Lang>(transA, m, n, alpha, A.blob(), B.blob(), beta, - C->blob(), ctx); + [alpha, A, beta, B, C](Context *ctx) { + GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha, + A.blob(), B.blob(), beta, C->blob(), ctx); }, {A.blob(), B.blob()}, {C->blob()}); }); } else { CHECK(!C->transpose()); - bool transB = B.transpose(); - size_t k = transB ? 
B.shape()[1] : B.shape()[0]; - n = C->shape()[1]; - CHECK_EQ(C->shape()[0], m); - CHECK_EQ(A.Size(), m * k); - CHECK_EQ(B.Size(), n * k); TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { C->device()->Exec( - [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) { - GEMM<DType, Lang>(transA, transB, m, n, k, alpha, A.blob(), - B.blob(), beta, C->blob(), ctx); + [alpha, A, beta, B, C](Context *ctx) { + GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), + B.shape(1), A.shape(1), alpha, A.blob(), B.blob(), + beta, C->blob(), ctx); }, {A.blob(), B.blob()}, {C->blob()}); }); } } -void Bernoulli(float p, Tensor* t) { +void Bernoulli(float p, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [p, t](Context* ctx) { + [p, t](Context *ctx) { Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx); }, {}, {t->blob()}, true); }); } -void Uniform(float low, float high, Tensor* t) { +void Uniform(float low, float high, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [low, high, t](Context* ctx) { + [low, high, t](Context *ctx) { Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx); }, {}, {t->blob()}, true); }); } -void Gaussian(float mean, float std, Tensor* t) { +void Gaussian(float mean, float std, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [mean, std, t](Context* ctx) { + [mean, std, t](Context *ctx) { Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx); }, {}, {t->blob()}, true); }); } + +// ======follow the consistency guide +void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); } +/// Add column 'v' onto each column of matrix M; +void AddColumn(const float alpha, const float beta, const Tensor &v, + Tensor *M) { + if (M->transpose()) { + Tensor X = M->T(); + AddRow(v, &X); + } else { + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + size_t nb_row = M->shape(0), nb_col = M->shape(1); + CHECK_EQ(nb_row, v.Size()); + + Tensor one(Shape{1, nb_col}, M->device(), M->data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Tensor vmat = Reshape(v, Shape{nb_row, 1}); + Mult(alpha, vmat, one, beta, M); + } +} +void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); } + +/// Sub column 'v' by each column of matrix M; write results into 'out' +void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) { + if (M->transpose()) { + Tensor X = M->T(); + AddColumn(v, &X); + } else { + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + size_t nb_row = M->shape(0), nb_col = M->shape(1); + CHECK_EQ(nb_col, v.Size()); + + Tensor one(Shape{nb_row, 1}, M->device(), M->data_type()); + one.SetValue(1.0f); + Tensor vmat = Reshape(v, Shape{1, nb_col}); + Mult(alpha, one, vmat, beta, M); + } +} + +template <typename SType> Tensor Div(const SType alpha, const Tensor &in) { + Tensor out(in.shape(), in.device(), in.data_type()); + Div(alpha, in, &out); + return out; +} + +template Tensor Div<float>(const float, const Tensor &); + +template <typename SType> +void Div(const SType alpha, const Tensor &in, Tensor *out) { + CheckDataTypeAndLang(in, *out); + CHECK(in.shape() == out->shape()); + TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { + // TODO(wangwei) type cast SType to DType; + in.device()->Exec( + [alpha, in, out](Context *ctx) { + Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx); + }, + {in.blob()}, {out->blob()}); + }); +} 
+template void Div<float>(const float, const Tensor &, Tensor *); + +/// Divide column 'v' by each column of matrix M; write results into 'out' +void DivColumn(const Tensor &v, Tensor *M) { + Tensor inv; + TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); + MultColumn(inv, M); +} + +/// Divide row 'v' by each row of matrix M; write results into 'out' +void DivRow(const Tensor &v, Tensor *M) { + Tensor inv; + TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); + MultRow(inv, M); +} + +/// Multiply column 'v' and each column of matrix M; write results into 'out' +void MultColumn(const Tensor &v, Tensor *M) { + CHECK(!M->transpose()) << "Not supported yet"; + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + CHECK_EQ(v.Size(), M->shape(0)); + CheckDataTypeAndLang(*M, v); + TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { + v.device()->Exec( + [M, v](Context *ctx) { + DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), + v.blob(), M->blob(), ctx); + }, + {M->blob(), v.blob()}, {M->blob()}); + }); +} + +/// Multiply row 'v' with each row of matrix M; write results into 'out' +void MultRow(const Tensor &v, Tensor *M) { + CHECK(!M->transpose()) << "Not supported yet"; + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + CHECK_EQ(v.Size(), M->shape(1)); + CheckDataTypeAndLang(*M, v); + TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { + v.device()->Exec( + [M, v](Context *ctx) { + DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(), + M->blob(), ctx); + }, + {M->blob(), v.blob()}, {M->blob()}); + }); +} + +void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); } + +void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } + +void SumColumns(const Tensor &M, Tensor *v) { + if (M.transpose()) { + Tensor X = M.T(); + SumRows(X, v); + } else { + CHECK_EQ(M.nDim(), 2); + CHECK_EQ(v->nDim(), 1); + size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1); + CHECK_EQ(nb_row, v->Size()); + + Tensor one(Shape{nb_col, 1}, M.device(), M.data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Mult(M, one, v); + } +} +void SumRows(const Tensor &M, Tensor *v) { + if (M.transpose()) { + Tensor X = M.T(); + SumColumns(X, v); + } else { + CHECK_EQ(M.nDim(), 2); + CHECK_EQ(v->nDim(), 1); + size_t nb_row = M.shape(0), nb_col = M.shape(1); + CHECK_EQ(nb_col, v->Size()); + + Tensor one(Shape{nb_row, 1}, M.device(), M.data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Tensor X = M.T(); + Mult(X, one, v); + } +} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor_math.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h index b53d4cb..98d91bf 100644 --- a/src/core/tensor/tensor_math.h +++ b/src/core/tensor/tensor_math.h @@ -39,178 +39,184 @@ namespace singa { /// Scale(const float alpha, const Blob* in, Blob* out); /// For such cases, use x, v, alpha, etc for scalar types. /// For blas functions, follow the blas style for argument names. +/// Use 'M' and 'v' for matrix and vector tensors in functions involving both +/// matrix and vectors. +/// 5. For Blob argument xxx, name its raw pointer as xxxPtr. +/// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h +/// 7. Use size_t for the number of elements, rows or columns. +/// 8. Use the same name for the Tensor and Blob level math functions. 
// ================Linear algebra functions==================================== /// ret[i] = |input[i]| template <typename DType, typename Lang> -void Abs(int count, const Blob* input, Blob* ret, Context* ctx) { +void Abs(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } template <typename DType, typename Lang> -void Set(int count, DType x, Blob* ret, Context* ctx) { +void Set(int count, DType x, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// sum all elements of input into ret template <typename DType, typename Lang> -void Sum(int count, const Blob* input, DType* ret, Context* ctx) { +void Sum(int count, const Blob *input, DType *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// ret[i] = sign(input[i]) template <typename DType, typename Lang> -void Sign(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sign(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Base is e, Neper number. ret[i]=exp(input[i]) template <typename DType, typename Lang> -void Exp(int count, const Blob* input, Blob* ret, Context* ctx) { +void Exp(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]). template <typename DType, typename Lang> -void Log(int count, const Blob* input, Blob* ret, Context* ctx) { +void Log(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=sqrt([input[i]) template <typename DType, typename Lang> -void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=square([input[i]) template <typename DType, typename Lang> -void Square(int count, const Blob* input, Blob* ret, Context* ctx) { +void Square(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=tanh([input[i]) template <typename DType, typename Lang> -void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) { +void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=max(0, input[i]) template <typename DType, typename Lang> -void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) { +void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=sigmoid([input[i]) template <typename DType, typename Lang> -void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Do softmax for each row invidually template <typename DType, typename Lang> -void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) { +void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } // TODO(wangwei) unify SumRow and SumCol. 
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
+                Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of ret
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-    Context* ctx) {
+void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Add the vector v to every column of A as the column of ret
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-    Context* ctx) {
+void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
-
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
-    Context* ctx) {
+void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   Add<DType, Lang>(count, input, -x, ret, ctx);
 }
 /// ret = input * x
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* input, DType x, Blob* ret, Context* ctx)
-{
+void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input / x
 template <typename DType, typename Lang>
-void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
 }
 
 /// ret = lhs + rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs - rhs
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs * rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
-    Context* ctx) {
+void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
-    Context* ctx) {
+void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -218,36 +224,36 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob* input, int* ret, Context* ctx) {
+void Amax(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob* input, int* ret, Context* ctx) {
+void Amin(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
+void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
+void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob* ret, Context* ctx) {
+void Scale(int count, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
-    Context* ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -255,56 +261,64 @@ void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
-    DType beta, Blob* ret, Context* ctx) {
+void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
+          DType beta, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // ===== Level 3
-/// ret = alpha * op(A) * op(B) + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
-template <typename DType, typename Lang>
-void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
-          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
 
 // ================Random functions===========================================
 /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob* ret, Context* ctx) {
+void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
+void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/*Some operations would have many config/hyper-parameters, e.g., Conv, and
-these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-The specific fields are implemented by inheriting OpConf, and casting the
-pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
-*/
-}  // namespace singa
+void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ======== Follow the consistency guide of the math API ========
+
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Divide alpha by each element of 'in', i.e. out[i] = alpha / in[i].
+template <typename DType, typename Lang>
+void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Multiply matrix 'M' with a diagonal matrix whose diagonal is taken from 'v'.
+/// If side_right is true, compute M * diag(v); otherwise compute diag(v) * M.
+template <typename DType, typename Lang>
+void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+          const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// C = alpha * op(A) * op(B) + beta * C.
+/// transA/transB indicate whether the internal data layout of A/B is transposed.
+template <typename DType, typename Lang>
+void GEMM(const bool transA, const bool transB, const size_t nrowA,
+          const size_t ncolB, const size_t ncolA, const DType alpha,
+          const Blob *A, const Blob *B, const DType beta, Blob *C,
+          Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+}  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
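To make the new Blob-level declarations concrete, here is a hedged sketch (not part of this patch) of possible specializations: a plain-loop CPU DGMM and a CBLAS-backed GEMM guarded by USE_CBLAS. The lang::Cpp tag, Blob::data()/mutable_data(), and row-major storage are assumed; the actual specializations live in tensor_math_cpp.h and tensor_math_cuda.h.

template <>
void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
                            const size_t ncol, const Blob *M, const Blob *v,
                            Blob *out, Context *ctx) {
  const float *MPtr = static_cast<const float *>(M->data());
  const float *vPtr = static_cast<const float *>(v->data());
  float *outPtr = static_cast<float *>(out->mutable_data());
  for (size_t r = 0; r < nrow; r++)
    for (size_t c = 0; c < ncol; c++)
      // side_right: out = M * diag(v), column c scaled by v[c];
      // otherwise:  out = diag(v) * M, row r scaled by v[r].
      outPtr[r * ncol + c] =
          MPtr[r * ncol + c] * (side_right ? vPtr[c] : vPtr[r]);
}

#ifdef USE_CBLAS
#include <cblas.h>

template <>
void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
                            const size_t nrowA, const size_t ncolB,
                            const size_t ncolA, const float alpha,
                            const Blob *A, const Blob *B, const float beta,
                            Blob *C, Context *ctx) {
  // Leading dimensions for row-major storage; nrowA/ncolA/ncolB describe
  // op(A) and op(B), i.e. the shapes after applying the trans flags.
  const int lda = transA ? nrowA : ncolA;
  const int ldb = transB ? ncolA : ncolB;
  const int ldc = ncolB;
  cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
              transB ? CblasTrans : CblasNoTrans, nrowA, ncolB, ncolA, alpha,
              static_cast<const float *>(A->data()), lda,
              static_cast<const float *>(B->data()), ldb, beta,
              static_cast<float *>(C->mutable_data()), ldc);
}
#endif  // USE_CBLAS

On the GPU side the natural counterparts are cublasSdgmm and cublasSgemm; since cuBLAS assumes column-major storage, a CUDA specialization would likely compensate by flipping the side flag for DGMM and swapping the operand order for GEMM.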
