SINGA-182 Clean math function APIs and implementations. Implement GEMM/DGMM to support sum rows/columns, and add/sub/mult/div by row/column.
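As context for the API changes in include/singa/core/tensor.h below, here is a minimal usage sketch of the new matrix/vector helpers and the consolidated Mult(alpha, A, B, beta, C) call. It is illustrative only and not part of the commit; it assumes kFloat32 tensors on the default CppCPU device and a program linked against the built SINGA library.

#include "singa/core/tensor.h"
using namespace singa;

int main() {
  Tensor M(Shape{3, 2});        // 3x2 matrix, kFloat32 on the default device
  Tensor v(Shape{2});           // length-2 row vector
  M.SetValue(2.0f);
  v.SetValue(1.0f);

  AddRow(v, &M);                // M[r][c] += v[c] for every row r
  MultRow(v, &M);               // M[r][c] *= v[c], implemented via DGMM

  Tensor col(Shape{3});
  col.SetValue(0.5f);
  SubColumn(col, &M);           // M[r][c] -= col[r] for every column c

  Tensor rowsum(Shape{2});
  SumRows(M, &rowsum);          // sum all rows of M into a single row

  Tensor B(Shape{2, 4}), C(Shape{3, 4});
  B.SetValue(1.0f);
  Mult(1.0f, M, B, 0.0f, &C);   // GEMM: C = 1.0 * M * B + 0.0 * C
  return 0;
}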
Pass all test; Format code and update the consistency guide for cleaning code. Add the compile guard for USE_CBLAS. TODO, find cblas by cmake and set USE_CBLAS Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/870d1a97 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/870d1a97 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/870d1a97 Branch: refs/heads/master Commit: 870d1a97e19061f3f42b9cf907874609f7158231 Parents: fbd5219 Author: Wei Wang <[email protected]> Authored: Fri May 27 20:31:41 2016 +0800 Committer: Wei Wang <[email protected]> Committed: Mon May 30 13:24:51 2016 +0800 ---------------------------------------------------------------------- CMakeLists.txt | 1 + cmake/Cuda.cmake | 1 + cmake/Dependencies.cmake | 4 + cmake/Templates/singa_config.h.in | 4 +- include/singa/core/tensor.h | 257 +++++++------- include/singa/utils/cuda_utils.h | 60 ++-- src/core/device/cpp_cpu.cc | 13 +- src/core/device/cuda_gpu.cc | 10 +- src/core/device/device.cc | 1 - src/core/tensor/math_kernel.cu | 26 ++ src/core/tensor/math_kernel.h | 15 +- src/core/tensor/tensor.cc | 610 +++++++++++++++++++------------- src/core/tensor/tensor_math.h | 160 +++++---- src/core/tensor/tensor_math_cpp.h | 157 +++++--- src/core/tensor/tensor_math_cuda.h | 117 ++++-- test/singa/test_cpp_math.cc | 25 -- test/singa/test_mse.cc | 26 +- test/singa/test_tensor.cc | 2 - test/singa/test_tensor_math.cc | 447 ++++++++++++++++++++++- 19 files changed, 1325 insertions(+), 611 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index e08fb98..d585497 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}") INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR}) #OPTION(CPU_ONLY "use GPU libs" OFF) +OPTION(USE_CBLAS "Use CBlas libs" OFF) OPTION(USE_CUDA "Use Cuda libs" ON) OPTION(USE_CUDNN "Use Cudnn libs" ON) OPTION(USE_OPENCV "Use opencv" OFF) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Cuda.cmake ---------------------------------------------------------------------- diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 19d4e27..4985bb8 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -22,3 +22,4 @@ ENDIF() INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS}) LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) +MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Dependencies.cmake ---------------------------------------------------------------------- diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ae28073..e995553 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -14,3 +14,7 @@ ELSE() SET(USE_CUDA FALSE) SET(USE_CUDNN FALSE) ENDIF() + + +#LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so") +#MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/cmake/Templates/singa_config.h.in ---------------------------------------------------------------------- diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in index 
e0f7328..5e8b32d 100644 --- a/cmake/Templates/singa_config.h.in +++ b/cmake/Templates/singa_config.h.in @@ -4,7 +4,9 @@ // Binaries director #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" -#cmakedefine CPU_ONLY +#cmakedefine CPU_ONLY + +#cmakedefine USE_CBLAS // cuda #cmakedefine USE_CUDA http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/core/tensor.h ---------------------------------------------------------------------- diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h index e560071..f51c899 100644 --- a/include/singa/core/tensor.h +++ b/include/singa/core/tensor.h @@ -33,22 +33,22 @@ namespace singa { typedef vector<size_t> Shape; typedef Shape::iterator ShapeIter; -inline size_t Product(const Shape& shape, int start = 0, size_t len = 0) { +inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) { if (len == 0) len = shape.size(); CHECK_LE(len, shape.size()); size_t v = 1; - for (unsigned int i = start; i < len; i ++) + for (unsigned int i = start; i < len; i++) v *= shape[i]; return v; } /// hardcode the width of types defined in DataType const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int), - sizeof(char), sizeof(double)}; + sizeof(char), sizeof(double)}; inline size_t SizeOf(DataType t) { static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t), - "Num of data types not match num of data width"); + "Num of data types not match num of data width"); CHECK_GT(kNumDataType, t); return kDataWidth[t]; } @@ -62,52 +62,44 @@ inline size_t SizeOf(DataType t) { /// then it must be set up correctly (shape, device). Otherwise, runtime error /// like SegmentFault would happen. Simply type/device check would be conducted. class Tensor { - public: +public: ~Tensor(); Tensor(); - explicit Tensor(Shape&& shape, DataType dtype = kFloat32); - explicit Tensor(const Shape& shape, DataType dtype = kFloat32); - Tensor(Shape&& shape, Device* dev, DataType dtype = kFloat32); - Tensor(const Shape& shape, Device* dev, DataType dtype = kFloat32); + explicit Tensor(Shape &&shape, DataType dtype = kFloat32); + explicit Tensor(const Shape &shape, DataType dtype = kFloat32); + Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32); + Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32); /// Copy Tensor to share the internal data. No deep copy. - Tensor(const Tensor& from); + Tensor(const Tensor &from); /// Copy Tensor to share the internal data. No deep copy. - Tensor(Tensor&& from); + Tensor(Tensor &&from); /// For functions in xx_math.cc to access the blob. /// Users should not operate against Blob directly. /// blob_ is allocated in constructors. - Blob* blob() const { - return blob_; - } + Blob *blob() const { return blob_; } - Device* device() const { - return device_; - } + Device *device() const { return device_; } /// Return immutable Tensor values with given type. 
- template <typename DType> - DType data() const { - return static_cast<DType> (blob()->data()); + template <typename DType> DType data() const { + return static_cast<DType>(blob()->data()); } /// data type, including kFloat16, kFloat32, kInt - const DataType data_type() const { - return data_type_; - } + const DataType data_type() const { return data_type_; } - const Shape& shape() const { - return shape_; - } + const Shape &shape() const { return shape_; } - int nDim() const { - return shape_.size(); + const size_t shape(size_t idx) const { + CHECK_LT(idx, shape_.size()); + return shape_.at(idx); } - bool transpose() const { - return transpose_; - } + int nDim() const { return shape_.size(); } + + bool transpose() const { return transpose_; } /// Return number of total elements size_t Size() const { @@ -116,39 +108,37 @@ class Tensor { } /// Return memory size (i.e., Bytes) - size_t MemSize() const { - return blob_->size(); - } + size_t MemSize() const { return blob_->size(); } /// Reset the tensor shape, it may reallocate blob, if MemSize() changes. - void Reshape(const Shape& shape); + void Reshape(const Shape &shape); + void Reshape(Shape &&shape); /// Reset the shape, device, and data type as given tensor. /// If blob size changes, then reallocate a new blob. The previous blob would /// be deleted. - void ResetLike(const Tensor& t); + void ResetLike(const Tensor &t); /// Reset the data type, it would reallocate blob if type changes. void AsType(DataType type); /// Reset the device. /// If the target device is a diff device, then do deep data copy. - void ToDevice(Device* dev); + void ToDevice(Device *dev); /// Equivalent to ToDevice(host_dev). void ToHost(); /// Set each element of the tensor to be x - template<typename SType> - void SetValue(SType x); + template <typename SType> void SetValue(const SType x); /// For init the tensor values, copy 'num' elements. - template<typename DType> - void CopyDataFromHostPtr(const DType* src, size_t num); + template <typename DType> + void CopyDataFromHostPtr(const DType *src, size_t num); /// Copy data from another Tensor which may be on a diff device. /// Meta data would not be copied! - void CopyData(const Tensor& other); + void CopyData(const Tensor &other); /// Return an exactly the same Tensor with data been deep copied. Tensor Clone(); @@ -160,135 +150,124 @@ class Tensor { Tensor T() const; /// Copy the meta info with data blob shared. - Tensor& operator=(const Tensor& t); + Tensor &operator=(const Tensor &t); /// Copy the meta info with data blob shared. - Tensor& operator=(Tensor&& t); + Tensor &operator=(Tensor &&t); - - Tensor& operator+=(const Tensor& t); + Tensor &operator+=(const Tensor &t); // void operator+=(Tensor&& t); - Tensor& operator-=(const Tensor& t); + Tensor &operator-=(const Tensor &t); // void operator-=(Tensor&& t); - Tensor& operator*=(const Tensor& t); + Tensor &operator*=(const Tensor &t); // void operator*=(Tensor&& t); - Tensor& operator/=(const Tensor& t); + Tensor &operator/=(const Tensor &t); // void operator/=(Tensor&& t); // Scalar operations. 
/// T is a scalar type - template<typename DType> - Tensor& operator+=(DType x); + template <typename DType> Tensor &operator+=(DType x); /// T is a scalar type - template <typename DType> - Tensor& operator-=(const DType x); + template <typename DType> Tensor &operator-=(const DType x); /// T is a scalar type - template <typename DType> - Tensor& operator*=(const DType x); + template <typename DType> Tensor &operator*=(const DType x); /// T is a scalar type - template <typename DType> - Tensor& operator/=(const DType x); + template <typename DType> Tensor &operator/=(const DType x); /// save Tensor into a proto msg // void ToProto(TensorProto* t); /// load Tensor from proto msg // void FromProto(const TensorProto& t); - protected: +protected: bool transpose_ = false; DataType data_type_ = kFloat32; - Device* device_ = nullptr; + Device *device_ = nullptr; /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free. /// If you want to get an allocated Blob, use blob() instead of blob_. - Blob* blob_ = nullptr; + Blob *blob_ = nullptr; Shape shape_; }; +inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) { + CHECK_EQ(in1.data_type(), in2.data_type()); + CHECK_EQ(in1.device()->lang(), in2.device()->lang()); +} + +Tensor Reshape(const Tensor &in, const Shape &s); +Tensor Reshape(const Tensor &in, Shape &&s); + // For tensors with sparse content, e.g., missing columns or rows. // class SparseTensor : public Tensor {}; /// Copy 'num' elements of src to dst. /// The first 'src_offset' ('dst_offset') elements will be skipped. -void CopyDataToFrom(Tensor* dst, - const Tensor& src, - size_t num, - size_t src_offset = 0, - size_t dst_offset = 0); +void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, + size_t src_offset = 0, size_t dst_offset = 0); // ==================Simple Linear Algebra Operations========================= -Tensor Abs(const Tensor& t); -Tensor Exp(const Tensor& t); -Tensor Log(const Tensor& t); -Tensor ReLU(const Tensor& t); -Tensor Sigmoid(const Tensor& t); -Tensor Sign(const Tensor& t); -Tensor Sqrt(const Tensor& t); -Tensor Square(const Tensor& t); -Tensor Tanh(const Tensor& t); - - -template<typename SType> -SType Sum(const Tensor& t); +Tensor Abs(const Tensor &t); +Tensor Exp(const Tensor &t); +Tensor Log(const Tensor &t); +Tensor ReLU(const Tensor &t); +Tensor Sigmoid(const Tensor &t); +Tensor Sign(const Tensor &t); +Tensor Sqrt(const Tensor &t); +Tensor Square(const Tensor &t); +Tensor Tanh(const Tensor &t); + +template <typename SType> SType Sum(const Tensor &t); /// Sum elements in the Tensor, currently only support vector and matrix. /// if 'axis' is 0, sum all rows into a single row /// if 'axis' is 1, sum all columns into a single column /// TODO(wangwei) support arbitrary Tensor like numpy.sum -Tensor Sum(const Tensor& t, int axis); +Tensor Sum(const Tensor &t, int axis); /// Average elements in the Tensor, currently only support vector and matrix. /// if 'axis' is 0, average all rows into a single row /// if 'axis' is 1, average all columns into a single column /// TODO(wangwei) support arbitrary Tensor like numpy.average -Tensor Average(const Tensor&t, int axis); +Tensor Average(const Tensor &t, int axis); /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows, /// and shape_[axis+1]*...*shape_[nDim()] columns. /// and do softmax along each row. 
-Tensor Softmax(const Tensor& t, int axis = -1); -void Softmax(const Tensor& t, Tensor* ret, int axis = -1); +Tensor Softmax(const Tensor &t, int axis = -1); +void Softmax(const Tensor &t, Tensor *ret, int axis = -1); /// Element-wise opeartion, ret[i]=t[i]^x -template<typename DType> -Tensor Pow(const Tensor& t, DType x); +template <typename DType> Tensor Pow(const Tensor &t, DType x); /// Element-wise opeartion, ret[i]=t[i]^x -template<typename DType> -void Pow(const Tensor& t, DType x, Tensor* ret); +template <typename DType> void Pow(const Tensor &t, DType x, Tensor *ret); /// Element-wise opeartion, ret[i]=baes[i]^exp[i] -Tensor Pow(const Tensor& base, Tensor exp); +Tensor Pow(const Tensor &base, Tensor exp); /// Element-wise opeartion, ret[i]=baes[i]^exp[i] -void Pow(const Tensor& base, const Tensor& exp, Tensor* ret); +void Pow(const Tensor &base, const Tensor &exp, Tensor *ret); -Tensor operator+(const Tensor& lhs, const Tensor& rhs); -void Add(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator-(const Tensor& lhs, const Tensor& rhs); -void Sub(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator*(const Tensor& lhs, const Tensor& rhs); -void EltwiseMult(const Tensor& lhs, const Tensor& rhs, Tensor* ret); -Tensor operator/(const Tensor& lhs, const Tensor& rhs); -void Div(const Tensor& lhs, const Tensor& rhs, Tensor* ret); +Tensor operator+(const Tensor &lhs, const Tensor &rhs); +void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator-(const Tensor &lhs, const Tensor &rhs); +void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator*(const Tensor &lhs, const Tensor &rhs); +void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret); +Tensor operator/(const Tensor &lhs, const Tensor &rhs); +void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret); -template <typename DType> -Tensor operator+(const Tensor& t, DType x); -template <typename DType> -void Add(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator+(const Tensor &t, DType x); +template <typename DType> void Add(const Tensor &t, DType x, Tensor *ret); -template <typename DType> -Tensor operator-(const Tensor& t, DType x); -template <typename DType> -void Sub(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator-(const Tensor &t, DType x); +template <typename DType> void Sub(const Tensor &t, DType x, Tensor *ret); +template <typename DType> Tensor operator*(const Tensor &t, DType x); template <typename DType> -Tensor operator*(const Tensor& t, DType x); -template <typename DType> -void EltwiseMult(const Tensor& t, DType x, Tensor* ret); +void EltwiseMult(const Tensor &t, DType x, Tensor *ret); -template <typename DType> -Tensor operator/(const Tensor& t, DType x); -template <typename DType> -void Div(const Tensor& t, DType x, Tensor* ret); +template <typename DType> Tensor operator/(const Tensor &t, DType x); +template <typename DType> void Div(const Tensor &t, DType x, Tensor *ret); // ================Blas operations============================================ // We fix the scalar argument type to be float. @@ -302,27 +281,59 @@ void Div(const Tensor& t, DType x, Tensor* ret); // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = lhs * rhs -Tensor Mult(const Tensor& lhs, const Tensor& rhs); +/// on the Tensor shape. 
result = A * B +Tensor Mult(const Tensor &A, const Tensor &B); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = lhs * rhs -void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret); +/// on the Tensor shape. C = A * B +void Mult(const Tensor &A, const Tensor &B, Tensor *C); /// Do matrix vector multipication or matrix matrix multiplication depdending -/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret -Tensor Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs); -/// Do matrix vector multipication or matrix matrix multiplication depdending /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret -void Mult(float alpha, const Tensor& lhs, float beta, const Tensor& rhs, - Tensor* C); +void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs, + const float beta, Tensor *C); // ================Random operations========================================== /// For each element x set x = 1 if random() < p; otherwise x = 1. -void Bernoulli(float p, Tensor* t); +void Bernoulli(float p, Tensor *t); /// Fill in Tensor 't' following uniform distribution. -void Uniform(float low, float high, Tensor* t); +void Uniform(float low, float high, Tensor *t); /// Fill in Tensor 't' following Gaussian distribution. -void Gaussian(float mean, float std, Tensor* t); +void Gaussian(float mean, float std, Tensor *t); + +// follow the consistency guide +// ============Matrix vector operations======================================= +/// Add column 'v' with each column of matrix M +void AddColumn(const Tensor &v, Tensor *M); +void AddColumn(const float alpha, const float beta, const Tensor &v, + Tensor *out); +/// Sub column 'v' by each column of matrix M +void SubColumn(const Tensor &v, Tensor *M); +/// Multiply column 'v' and each column of matrix M; write results into 'out' +void MultColumn(const Tensor &v, Tensor *M); +/// Divide column 'v' by each column of matrix M; write results into 'out' +void DivColumn(const Tensor &v, Tensor *M); + +/// Add row 'v' with each row of matrix M; write results into 'out' +void AddRow(const Tensor &v, Tensor *out); +void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M); +/// Sub row 'v' by each row of matrix M; write results into 'out' +void SubRow(const Tensor &v, Tensor *M); +/// Multiply row 'v' with each row of matrix M; write results into 'out' +void MultRow(const Tensor &v, Tensor *M); +/// Divide row 'v' by each row of matrix M; write results into 'out' +void DivRow(const Tensor &v, Tensor *M); + +/// Sum all rows of matrix M into a single row as 'out' +void SumRows(const Tensor &M, Tensor *out); +/// Sum all columns of matrix M into a single column as 'out' +void SumColumns(const Tensor &M, Tensor *out); + +/// For each element x of Tensor 'in', compute alpha/x +template <typename SType> Tensor Div(const SType alpha, const Tensor &in); + +/// For each element x of Tensor 'in', compute alpha/x into 'out' +template <typename SType> +void Div(const SType alpha, const Tensor &in, Tensor *out); } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/include/singa/utils/cuda_utils.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h index 076d0d1..17eb683 100644 --- a/include/singa/utils/cuda_utils.h +++ b/include/singa/utils/cuda_utils.h @@ -8,33 +8,7 @@ #include <cuda.h> #include <cuda_runtime.h> -// -// CUDA macros -// - 
-// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ - } while (0) - -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << cublasGetErrorString(status); \ - } while (0) - -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << curandGetErrorString(status); \ - } while (0) - -const char* cublasGetErrorString(cublasStatus_t error) { +inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -64,7 +38,7 @@ const char* cublasGetErrorString(cublasStatus_t error) { return "Unknown cublas status"; } -const char* curandGetErrorString(curandStatus_t error) { +inline const char* curandGetErrorString(curandStatus_t error) { switch (error) { case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS"; @@ -95,5 +69,33 @@ const char* curandGetErrorString(curandStatus_t error) { } return "Unknown curand status"; } -#endif + +// +// CUDA macros +// + +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ + << cublasGetErrorString(status); \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ + << curandGetErrorString(status); \ + } while (0) + + +#endif // USE_CUDA #endif // SINGA_UTILS_CUDA_UTILS_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cpp_cpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc index 28b0da4..44f614a 100644 --- a/src/core/device/cpp_cpu.cc +++ b/src/core/device/cpp_cpu.cc @@ -33,13 +33,18 @@ void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) { } void* CppCPU::Malloc(int size) { - void *ptr = malloc(size); - memset(ptr, 0, size); - return ptr; + if (size > 0) { + void *ptr = malloc(size); + memset(ptr, 0, size); + return ptr; + } else { + return nullptr; + } } void CppCPU::Free(void* ptr) { - free(ptr); + if (ptr != nullptr) + free(ptr); } void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/cuda_gpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc index 0ba05fb..5d4e1ed 100644 --- a/src/core/device/cuda_gpu.cc +++ b/src/core/device/cuda_gpu.cc @@ -89,15 +89,17 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes, /// Allocate cpu memory. void* CudaGPU::Malloc(int size) { void* ptr = nullptr; - CUDA_CHECK(cudaMalloc(&ptr, size)); - CUDA_CHECK(cudaMemset(ptr, 0, size)); + if (size > 0) { + CUDA_CHECK(cudaMalloc(&ptr, size)); + CUDA_CHECK(cudaMemset(ptr, 0, size)); + } return ptr; } /// Free cpu memory. 
void CudaGPU::Free(void* ptr) { - CHECK_NE(ptr, nullptr); - CUDA_CHECK(cudaFree(ptr)); + if (ptr != nullptr) + CUDA_CHECK(cudaFree(ptr)); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/device/device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/device.cc b/src/core/device/device.cc index ede3fda..1d3c446 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -35,7 +35,6 @@ void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs, Blob* Device::NewBlob(int size) { if (size > 0) { void* ptr = Malloc(size); - // memset(ptr, 0, size); return new Blob(ptr, size); } else { return nullptr; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu index e67ea7b..88041b1 100644 --- a/src/core/tensor/math_kernel.cu +++ b/src/core/tensor/math_kernel.cu @@ -450,6 +450,32 @@ void set_value(int n, float v, float *out) { void threshold(int n, float alpha, const float *in, float *out) { kernel_threshold<<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>>(in, out, alpha, n); } + + +// follow the consistency guide for math API +__global__ void KernelDiv(const size_t num, const float alpha, const float *in, + float *out) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; + idx += blockDim.x * gridDim.x) { + out[idx] = alpha / in[idx]; + } +} + +__global__ void KernelSet(const size_t num, const float x, float *out) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; + idx += blockDim.x * gridDim.x) { + out[idx] = x; + } +} + +void Div(const size_t num, float alpha, const float *in, float *out, + cudaStream_t s) { + KernelDiv<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, alpha, in, out); +} + +void Set(const size_t num, const float x, float *out, cudaStream_t s) { + KernelSet<<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>>(num, x, out); +} } // namespace cuda } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/math_kernel.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h index b016007..925346e 100644 --- a/src/core/tensor/math_kernel.h +++ b/src/core/tensor/math_kernel.h @@ -28,13 +28,7 @@ /// TODO(wangwei) Clean the function APIs as commented in tensor_math.h /// Add 'Context *ctx' as an argument of all cuda functions. namespace singa { -/* - void softmaxloss_forward(int n, int dim, const float *prob, - const int *label, float *loss); - void softmaxloss_backward(int n, int dim, float scale, - const int *label, float *grad); -*/ // TODO(wangwei) make all function templates. 
namespace cuda { void sum(int n, const float *in, float *out); @@ -44,7 +38,7 @@ void sum_row(int rows, int cols, int stride, const float *in, float *out); void sum_col(int rows, int cols, int stride, const float *in, float *out); void add_row(int rows, int cols, int stride, const float *in_row, - const float *in_mat, float *out); + const float *in_mat, float *out); void add(int n, const float *a, const float *b, float *out); @@ -87,7 +81,12 @@ void div(int n, const float *a, const float *b, float *out); void set_value(int n, float v, float *out); void threshold(int n, float alpha, const float *in, float *out); -} // cuda + +// follow the consistency guide for math API +void Div(const size_t num, const float x, const float *in, float *out, + cudaStream_t s); +void Set(const size_t num, const float x, float *out, cudaStream_t s); +} // cuda } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 052f3ff..0e47a4f 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -25,51 +25,51 @@ namespace singa { Tensor::~Tensor() { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + // LOG(ERROR) << "~"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = nullptr; } Tensor::Tensor() { device_ = &defaultDevice; } -Tensor::Tensor(const Shape& shape, DataType dtype) +Tensor::Tensor(const Shape &shape, DataType dtype) : data_type_(dtype), device_(&defaultDevice), shape_(shape) { device_ = &defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(Shape&& shape, DataType dtype) +Tensor::Tensor(Shape &&shape, DataType dtype) : data_type_(dtype), device_(&defaultDevice), shape_(shape) { device_ = &defaultDevice; blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(const Shape& shape, Device* device, DataType dtype) +Tensor::Tensor(const Shape &shape, Device *device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(Shape&& shape, Device* device, DataType dtype) +Tensor::Tensor(Shape &&shape, Device *device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_)); } -Tensor::Tensor(const Tensor& t) - : transpose_(t.transpose_), - data_type_(t.data_type_), - device_(t.device_), - blob_(t.blob()), - shape_(t.shape_) { +Tensor::Tensor(const Tensor &t) + : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_), + blob_(t.blob()), shape_(t.shape_) { blob_->IncRefCount(); + // LOG(ERROR) << "const&"; } -Tensor::Tensor(Tensor&& t) - : transpose_(t.transpose_), - data_type_(t.data_type_), - device_(t.device_), +Tensor::Tensor(Tensor &&t) + : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_), shape_(std::move(t.shape_)) { blob_ = t.blob_; t.blob_ = nullptr; + // LOG(ERROR) << "&&"; } -void Tensor::ResetLike(const Tensor& t) { +void Tensor::ResetLike(const Tensor &t) { if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); shape_ = t.shape_; device_ = t.device_; data_type_ = 
t.data_type_; @@ -77,28 +77,40 @@ void Tensor::ResetLike(const Tensor& t) { } } -void Tensor::Reshape(const Shape& shape) { - if (shape_ != shape) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +void Tensor::Reshape(const Shape &shape) { + if (Product(shape_) != Product(shape)) { + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_)); - shape_ = shape; } + shape_ = shape; +} + +void Tensor::Reshape(Shape &&shape) { + if (Product(shape_) != Product(shape)) { + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); + blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_)); + } + shape_ = std::move(shape); } void Tensor::AsType(DataType type) { if (data_type_ != type) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = device_->NewBlob(Product(shape_) * SizeOf(type)); data_type_ = type; } } -void Tensor::ToDevice(Device* dst) { +void Tensor::ToDevice(Device *dst) { // TODO(wangwei) the comparison is very strict. May compare against device ID? if (device_ != dst) { Tensor tmp(shape_, dst, data_type_); tmp.CopyData(*this); - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); blob_ = tmp.blob_; tmp.blob_ = nullptr; device_ = dst; @@ -108,7 +120,7 @@ void Tensor::ToDevice(Device* dst) { void Tensor::ToHost() { ToDevice(device_->host()); } template <typename DType> -void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) { +void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) { CHECK_EQ(sizeof(DType), SizeOf(data_type_)) << "data_type is " << DataType_Name(data_type_) << " user given type is of size " << sizeof(DType); @@ -118,10 +130,10 @@ void Tensor::CopyDataFromHostPtr(const DType* src, size_t num) { LOG(WARNING) << "Copy data from null host ptr"; } } -template void Tensor::CopyDataFromHostPtr(const float* src, size_t num); -template void Tensor::CopyDataFromHostPtr(const int* src, size_t num); +template void Tensor::CopyDataFromHostPtr(const float *src, size_t num); +template void Tensor::CopyDataFromHostPtr(const int *src, size_t num); -void Tensor::CopyData(const Tensor& src) { +void Tensor::CopyData(const Tensor &src) { CHECK_EQ(Size(), src.Size()); CHECK(blob_ != nullptr); // Do copy only if the src's blob is already initialized. 
@@ -139,14 +151,21 @@ Tensor Tensor::Clone() { Tensor Tensor::T() const { CHECK_EQ(shape_.size(), 2u); - Tensor t(*this); + Tensor t; + t.device_ = device_; + t.data_type_ = data_type_; t.transpose_ = ~transpose_; - std::swap(t.shape_[0], t.shape_[1]); + t.shape_.push_back(shape_[1]); + t.shape_.push_back(shape_[0]); + t.blob_ = blob_; + blob_->IncRefCount(); return t; } -Tensor& Tensor::operator=(const Tensor& t) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +Tensor &Tensor::operator=(const Tensor &t) { + // LOG(ERROR) << "= const &"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); transpose_ = t.transpose_; data_type_ = t.data_type_; shape_ = t.shape_; @@ -156,8 +175,10 @@ Tensor& Tensor::operator=(const Tensor& t) { return *this; } -Tensor& Tensor::operator=(Tensor&& t) { - if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_); +Tensor &Tensor::operator=(Tensor &&t) { + // LOG(ERROR) << "= &&"; + if (blob_ != nullptr && blob_->DecRefCount() == 0) + device_->FreeBlob(blob_); transpose_ = t.transpose_; data_type_ = t.data_type_; shape_ = std::move(t.shape_); @@ -167,10 +188,22 @@ Tensor& Tensor::operator=(Tensor&& t) { return *this; } -#define GenUnaryTensorArgMemberFunction(op, fn) \ - Tensor& Tensor::op(const Tensor& t) { \ - fn(*this, t, this); \ - return *this; \ +Tensor Reshape(const Tensor &in, const Shape &s) { + Tensor out(in); + out.Reshape(s); + return out; +} + +Tensor Reshape(const Tensor &in, Shape &&s) { + Tensor out(in); + out.Reshape(std::move(s)); + return out; +} + +#define GenUnaryTensorArgMemberFunction(op, fn) \ + Tensor &Tensor::op(const Tensor &t) { \ + fn(*this, t, this); \ + return *this; \ } GenUnaryTensorArgMemberFunction(operator+=, Add); @@ -178,13 +211,12 @@ GenUnaryTensorArgMemberFunction(operator-=, Sub); GenUnaryTensorArgMemberFunction(operator*=, EltwiseMult); GenUnaryTensorArgMemberFunction(operator/=, Div); -#define GenUnaryScalarArgMemberFunction(op, fn) \ - template <typename DType> \ - Tensor& Tensor::op(DType x) { \ - fn(*this, x, this); \ - return *this; \ - } \ - template Tensor& Tensor::op<float>(float x) +#define GenUnaryScalarArgMemberFunction(op, fn) \ + template <typename DType> Tensor &Tensor::op(DType x) { \ + fn(*this, x, this); \ + return *this; \ + } \ + template Tensor &Tensor::op<float>(float x) GenUnaryScalarArgMemberFunction(operator-=, Sub); GenUnaryScalarArgMemberFunction(operator+=, Add); @@ -192,7 +224,7 @@ GenUnaryScalarArgMemberFunction(operator*=, EltwiseMult); GenUnaryScalarArgMemberFunction(operator/=, Div); // ====================Tensor Operations======================================= -void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num, +void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, size_t dst_offset, size_t src_offset) { auto width = SizeOf(src.data_type()); CHECK_EQ(width, SizeOf(dst->data_type())); @@ -223,94 +255,93 @@ void CopyDataToFrom(Tensor* dst, const Tensor& src, size_t num, //============================================================================ /// typedef DType accroding to type value. /// DType would be used in the code block __VA_ARGS__. -#define TYPE_SWITCH(type, DType, ...) 
\ - do { \ - switch (type) { \ - case kFloat32: { \ - typedef float DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - case kInt: { \ - typedef int DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - case kChar: { \ - typedef char DType; \ - { __VA_ARGS__ } \ - break; \ - } \ - default: \ - LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \ - } \ +#define TYPE_SWITCH(type, DType, ...) \ + do { \ + switch (type) { \ + case kFloat32: { \ + typedef float DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + case kInt: { \ + typedef int DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + case kChar: { \ + typedef char DType; \ + { __VA_ARGS__ } \ + break; \ + } \ + default: \ + LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \ + } \ } while (0) /// typedef DType and Lang according to data type and device programming /// language respectively. /// type is from DataType, and lang is from LangType. /// DType and Lang would be used in __VA_ARGS__. -#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \ - do { \ - const int _SwitchShift = 3; \ - int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \ - switch (_SwitchHash) { \ - case ((kFloat32 << _SwitchShift) + kCuda): { \ - typedef float DType; \ - typedef lang::Cuda Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - case ((kFloat32 << _SwitchShift) + kCpp): { \ - typedef float DType; \ - typedef lang::Cpp Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - case ((kFloat32 << _SwitchShift) + kOpencl): { \ - typedef float DType; \ - typedef lang::Opencl Lang; \ - { __VA_ARGS__ } \ - break; \ - } \ - default: \ - LOG(FATAL) << "Unknown combination of data type " \ - << DataType_Name(dtype) << " and language " \ - << LangType_Name(ltype); \ - } \ +#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \ + do { \ + const int _SwitchShift = 3; \ + int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \ + switch (_SwitchHash) { \ + case ((kFloat32 << _SwitchShift) + kCuda): { \ + typedef float DType; \ + typedef lang::Cuda Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + case ((kFloat32 << _SwitchShift) + kCpp): { \ + typedef float DType; \ + typedef lang::Cpp Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + case ((kFloat32 << _SwitchShift) + kOpencl): { \ + typedef float DType; \ + typedef lang::Opencl Lang; \ + { __VA_ARGS__ } \ + break; \ + } \ + default: \ + LOG(FATAL) << "Unknown combination of data type " \ + << DataType_Name(dtype) << " and language " \ + << LangType_Name(ltype); \ + } \ } while (0) - -template <typename SType> -void Tensor::SetValue(SType x) { +template <typename SType> void Tensor::SetValue(const SType x) { CHECK_EQ(sizeof(SType), SizeOf(data_type_)); auto size = Size(); auto ptr = blob_; TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, { + // cast x to DType device_->Exec( - [size, x, ptr](Context* ctx) { Set<DType, Lang>(size, x, ptr, ctx); }, + [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); }, {}, {ptr}); }); } - - -#define EltwiseUnaryTensorFn(fn, t, ret) \ - do { \ - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ - ret->device()->Exec( \ - [t, ret](Context* ctx) { \ - fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \ - }, \ - {t.blob()}, {ret->blob()}); \ - }); \ +template void Tensor::SetValue<float>(const float x); + +#define EltwiseUnaryTensorFn(fn, t, ret) \ + do { \ + TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ + ret->device()->Exec( \ + [t, ret](Context *ctx) { \ + fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \ + }, \ + 
{t.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenUnaryTensorFunction(fn) \ - Tensor fn(const Tensor& t) { \ - Tensor ret(t.shape(), t.device(), t.data_type()); \ - auto* retptr = &ret; \ - EltwiseUnaryTensorFn(fn, t, retptr); \ - return ret; \ +#define GenUnaryTensorFunction(fn) \ + Tensor fn(const Tensor &t) { \ + Tensor ret(t.shape(), t.device(), t.data_type()); \ + auto *retptr = &ret; \ + EltwiseUnaryTensorFn(fn, t, retptr); \ + return ret; \ } GenUnaryTensorFunction(Abs); @@ -323,63 +354,33 @@ GenUnaryTensorFunction(Sqrt); GenUnaryTensorFunction(Square); GenUnaryTensorFunction(Tanh); -// TODO(wangwei) consider matrix transpose. -Tensor SumRows(const Tensor& t) { - int ndim = t.shape().size(); - CHECK_EQ(ndim, 2) << "Cannot do SumRows for Tensor with ndim = " << ndim; - size_t nrow = t.shape().at(0), ncol = t.shape().at(1); - Tensor ret(Shape{ncol}, t.device(), t.data_type()); - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - ret.device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { - SumRows<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx); - }, - {t.blob()}, {ret.blob()}); - }); - return ret; -} - -// TODO(wangwei) consider matrix transpose. -Tensor SumColumns(const Tensor& t) { - int ndim = t.shape().size(); - CHECK_EQ(ndim, 2) << "Cannot do SumColumns for Tensor with ndim = " << ndim; - CHECK(!t.transpose()); // TODO(wangwei) enable transpose - size_t nrow = t.shape().at(0), ncol = t.shape().at(1); - Tensor ret(Shape{nrow}, t.device(), t.data_type()); - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - ret.device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { - SumColumns<DType, Lang>(nrow, ncol, t.blob(), ret.blob(), ctx); - }, - {t.blob()}, {ret.blob()}); - }); - return ret; -} - // TODO(wangwei) conside async exec -template<> -float Sum<float>(const Tensor& t) { +template <> float Sum<float>(const Tensor &t) { float s = 0.0f; TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { - t.device()->Exec( - [t, &s](Context* ctx) { - Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx); + t.device()->Exec( + [t, &s](Context *ctx) { + Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx); }, {t.blob()}, {}); - }); + }); return s; } -Tensor Sum(const Tensor& t, int axis) { +Tensor Sum(const Tensor &M, int axis) { if (axis == 0) { - return SumRows(t); + Tensor out(Shape{M.shape(1)}, M.device(), M.data_type()); + SumRows(M, &out); + return out; } else { CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis; - return SumColumns(t); + Tensor out(Shape{M.shape(0)}, M.device(), M.data_type()); + SumColumns(M, &out); + return out; } } -Tensor Average(const Tensor& t, int axis) { +Tensor Average(const Tensor &t, int axis) { // operator/ only has implementation for float scalar type, hence it is // necessary to cast the denominator to a float. 
// TODO(wangwei) implement function for cast scalar type involved in Tensor @@ -401,13 +402,13 @@ Tensor Average(const Tensor& t, int axis) { } } -Tensor Softmax(const Tensor& t, int axis) { +Tensor Softmax(const Tensor &t, int axis) { Tensor ret(t.shape(), t.device(), t.data_type()); Softmax(t, &ret, axis); return ret; } -void Softmax(const Tensor& t, Tensor* ret, int axis) { +void Softmax(const Tensor &t, Tensor *ret, int axis) { int nrow = 1, ncol = t.Size(), size = ncol; CHECK_GE(axis, -1); CHECK_GT(t.shape().size(), 0u); @@ -418,34 +419,34 @@ void Softmax(const Tensor& t, Tensor* ret, int axis) { } TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { ret->device()->Exec( - [nrow, ncol, t, ret](Context* ctx) { + [nrow, ncol, t, ret](Context *ctx) { Softmax<DType, Lang>(nrow, ncol, t.blob(), ret->blob(), ctx); }, {t.blob()}, {ret->blob()}); }); } -#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \ - do { \ - TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \ - CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \ - ret->device()->Exec( \ - [lhs, rhs, ret](Context* ctx) { \ - fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \ - ctx); \ - }, \ - {lhs.blob(), rhs.blob()}, {ret->blob()}); \ - }); \ +#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \ + do { \ + TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \ + CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \ + ret->device()->Exec( \ + [lhs, rhs, ret](Context *ctx) { \ + fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \ + ctx); \ + }, \ + {lhs.blob(), rhs.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenBinaryTensorFunction(op, fn) \ - Tensor op(const Tensor& lhs, const Tensor& rhs) { \ - Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \ - fn(lhs, rhs, &ret); \ - return ret; \ - } \ - void fn(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { \ - EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \ +#define GenBinaryTensorFunction(op, fn) \ + Tensor op(const Tensor &lhs, const Tensor &rhs) { \ + Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \ + fn(lhs, rhs, &ret); \ + return ret; \ + } \ + void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \ + EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \ } GenBinaryTensorFunction(operator+, Add); @@ -454,32 +455,30 @@ GenBinaryTensorFunction(operator*, EltwiseMult); GenBinaryTensorFunction(operator/, Div); GenBinaryTensorFunction(Pow, Pow); -#define EltwiseTensorScalarFn(fn, t, x, ret) \ - do { \ - TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ - static_assert(std::is_same<SType, DType>::value, \ - "The Scalar type must match the Tensor data type"); \ - ret->device()->Exec( \ - [t, x, ret](Context* ctx) { \ - fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \ - }, \ - {t.blob()}, {ret->blob()}); \ - }); \ +#define EltwiseTensorScalarFn(fn, t, x, ret) \ + do { \ + TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \ + static_assert(std::is_same<SType, DType>::value, \ + "The Scalar type must match the Tensor data type"); \ + ret->device()->Exec( \ + [t, x, ret](Context *ctx) { \ + fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \ + }, \ + {t.blob()}, {ret->blob()}); \ + }); \ } while (0) -#define GenTensorScalarFunction(op, fn) \ - template <typename SType> \ - Tensor op(const Tensor& t, SType x) { \ - Tensor ret(t.shape(), t.device(), t.data_type()); \ - fn(t, x, &ret); \ - return ret; \ - } \ - template <typename SType> \ - void 
fn(const Tensor& t, SType x, Tensor* ret) { \ - EltwiseTensorScalarFn(fn, t, x, ret); \ - } \ - template Tensor op<float>(const Tensor& t, float x); \ - template void fn<float>(const Tensor& t, const float x, Tensor* ret) +#define GenTensorScalarFunction(op, fn) \ + template <typename SType> Tensor op(const Tensor &t, SType x) { \ + Tensor ret(t.shape(), t.device(), t.data_type()); \ + fn(t, x, &ret); \ + return ret; \ + } \ + template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) { \ + EltwiseTensorScalarFn(fn, t, x, ret); \ + } \ + template Tensor op<float>(const Tensor &t, float x); \ + template void fn<float>(const Tensor &t, const float x, Tensor *ret) GenTensorScalarFunction(operator+, Add); GenTensorScalarFunction(operator-, Sub); @@ -488,83 +487,216 @@ GenTensorScalarFunction(operator/, Div); GenTensorScalarFunction(Pow, Pow); // ================Blas operations============================================ -Tensor Mult(const Tensor& lhs, const Tensor& rhs) { - Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); +Tensor Mult(const Tensor &lhs, const Tensor &rhs) { + Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type()); Mult(lhs, rhs, &ret); return ret; } -void Mult(const Tensor& lhs, const Tensor& rhs, Tensor* ret) { - Mult(1, lhs, 1, rhs, ret); +void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { + Mult(1.0f, lhs, rhs, 0.0f, ret); } -Tensor Mult(float alpha, const Tensor& A, float beta, const Tensor& B) { - Tensor ret(A.shape(), A.device(), A.data_type()); - Mult(alpha, A, beta, B, &ret); - return ret; -} - -void Mult(float alpha, const Tensor& A, float beta, const Tensor& B, - Tensor* C) { +void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta, + Tensor *C) { CHECK_EQ(A.shape().size(), 2u); - bool transA = A.transpose(); - size_t m = transA ? A.shape()[1] : A.shape()[0], n = 0; - if (B.shape().size() == 1u) { - n = C->Size(); + if (B.nDim() == 1u) { TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { C->device()->Exec( - [transA, m, n, alpha, A, beta, B, C](Context* ctx) { - GEMV<DType, Lang>(transA, m, n, alpha, A.blob(), B.blob(), beta, - C->blob(), ctx); + [alpha, A, beta, B, C](Context *ctx) { + GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha, + A.blob(), B.blob(), beta, C->blob(), ctx); }, {A.blob(), B.blob()}, {C->blob()}); }); } else { CHECK(!C->transpose()); - bool transB = B.transpose(); - size_t k = transB ? 
B.shape()[1] : B.shape()[0]; - n = C->shape()[1]; - CHECK_EQ(C->shape()[0], m); - CHECK_EQ(A.Size(), m * k); - CHECK_EQ(B.Size(), n * k); TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { C->device()->Exec( - [transA, transB, m, n, k, alpha, A, beta, B, C](Context* ctx) { - GEMM<DType, Lang>(transA, transB, m, n, k, alpha, A.blob(), - B.blob(), beta, C->blob(), ctx); + [alpha, A, beta, B, C](Context *ctx) { + GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), + B.shape(1), A.shape(1), alpha, A.blob(), B.blob(), + beta, C->blob(), ctx); }, {A.blob(), B.blob()}, {C->blob()}); }); } } -void Bernoulli(float p, Tensor* t) { +void Bernoulli(float p, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [p, t](Context* ctx) { + [p, t](Context *ctx) { Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx); }, {}, {t->blob()}, true); }); } -void Uniform(float low, float high, Tensor* t) { +void Uniform(float low, float high, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [low, high, t](Context* ctx) { + [low, high, t](Context *ctx) { Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx); }, {}, {t->blob()}, true); }); } -void Gaussian(float mean, float std, Tensor* t) { +void Gaussian(float mean, float std, Tensor *t) { TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, { t->device()->Exec( - [mean, std, t](Context* ctx) { + [mean, std, t](Context *ctx) { Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx); }, {}, {t->blob()}, true); }); } + +// ======follow the consistency guide +void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); } +/// Add column 'v' onto each column of matrix M; +void AddColumn(const float alpha, const float beta, const Tensor &v, + Tensor *M) { + if (M->transpose()) { + Tensor X = M->T(); + AddRow(v, &X); + } else { + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + size_t nb_row = M->shape(0), nb_col = M->shape(1); + CHECK_EQ(nb_row, v.Size()); + + Tensor one(Shape{1, nb_col}, M->device(), M->data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Tensor vmat = Reshape(v, Shape{nb_row, 1}); + Mult(alpha, vmat, one, beta, M); + } +} +void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); } + +/// Sub column 'v' by each column of matrix M; write results into 'out' +void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) { + if (M->transpose()) { + Tensor X = M->T(); + AddColumn(v, &X); + } else { + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + size_t nb_row = M->shape(0), nb_col = M->shape(1); + CHECK_EQ(nb_col, v.Size()); + + Tensor one(Shape{nb_row, 1}, M->device(), M->data_type()); + one.SetValue(1.0f); + Tensor vmat = Reshape(v, Shape{1, nb_col}); + Mult(alpha, one, vmat, beta, M); + } +} + +template <typename SType> Tensor Div(const SType alpha, const Tensor &in) { + Tensor out(in.shape(), in.device(), in.data_type()); + Div(alpha, in, &out); + return out; +} + +template Tensor Div<float>(const float, const Tensor &); + +template <typename SType> +void Div(const SType alpha, const Tensor &in, Tensor *out) { + CheckDataTypeAndLang(in, *out); + CHECK(in.shape() == out->shape()); + TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { + // TODO(wangwei) type cast SType to DType; + in.device()->Exec( + [alpha, in, out](Context *ctx) { + Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx); + }, + {in.blob()}, {out->blob()}); + }); +} 
+template void Div<float>(const float, const Tensor &, Tensor *); + +/// Divide column 'v' by each column of matrix M; write results into 'out' +void DivColumn(const Tensor &v, Tensor *M) { + Tensor inv; + TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); + MultColumn(inv, M); +} + +/// Divide row 'v' by each row of matrix M; write results into 'out' +void DivRow(const Tensor &v, Tensor *M) { + Tensor inv; + TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); }); + MultRow(inv, M); +} + +/// Multiply column 'v' and each column of matrix M; write results into 'out' +void MultColumn(const Tensor &v, Tensor *M) { + CHECK(!M->transpose()) << "Not supported yet"; + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + CHECK_EQ(v.Size(), M->shape(0)); + CheckDataTypeAndLang(*M, v); + TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { + v.device()->Exec( + [M, v](Context *ctx) { + DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), + v.blob(), M->blob(), ctx); + }, + {M->blob(), v.blob()}, {M->blob()}); + }); +} + +/// Multiply row 'v' with each row of matrix M; write results into 'out' +void MultRow(const Tensor &v, Tensor *M) { + CHECK(!M->transpose()) << "Not supported yet"; + CHECK_EQ(M->nDim(), 2); + CHECK_EQ(v.nDim(), 1); + CHECK_EQ(v.Size(), M->shape(1)); + CheckDataTypeAndLang(*M, v); + TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, { + v.device()->Exec( + [M, v](Context *ctx) { + DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(), + M->blob(), ctx); + }, + {M->blob(), v.blob()}, {M->blob()}); + }); +} + +void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); } + +void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } + +void SumColumns(const Tensor &M, Tensor *v) { + if (M.transpose()) { + Tensor X = M.T(); + SumRows(X, v); + } else { + CHECK_EQ(M.nDim(), 2); + CHECK_EQ(v->nDim(), 1); + size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1); + CHECK_EQ(nb_row, v->Size()); + + Tensor one(Shape{nb_col, 1}, M.device(), M.data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Mult(M, one, v); + } +} +void SumRows(const Tensor &M, Tensor *v) { + if (M.transpose()) { + Tensor X = M.T(); + SumColumns(X, v); + } else { + CHECK_EQ(M.nDim(), 2); + CHECK_EQ(v->nDim(), 1); + size_t nb_row = M.shape(0), nb_col = M.shape(1); + CHECK_EQ(nb_col, v->Size()); + + Tensor one(Shape{nb_row, 1}, M.device(), M.data_type()); + one.SetValue(1.0f); // TODO(wangwei) cast type + Tensor X = M.T(); + Mult(X, one, v); + } +} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/870d1a97/src/core/tensor/tensor_math.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h index b53d4cb..98d91bf 100644 --- a/src/core/tensor/tensor_math.h +++ b/src/core/tensor/tensor_math.h @@ -39,178 +39,184 @@ namespace singa { /// Scale(const float alpha, const Blob* in, Blob* out); /// For such cases, use x, v, alpha, etc for scalar types. /// For blas functions, follow the blas style for argument names. +/// Use 'M' and 'v' for matrix and vector tensors in functions involving both +/// matrix and vectors. +/// 5. For Blob argument xxx, name its raw pointer as xxxPtr. +/// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h +/// 7. Use size_t for the number of elements, rows or columns. +/// 8. Use the same name for the Tensor and Blob level math functions. 
// ================Linear algebra functions==================================== /// ret[i] = |input[i]| template <typename DType, typename Lang> -void Abs(int count, const Blob* input, Blob* ret, Context* ctx) { +void Abs(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } template <typename DType, typename Lang> -void Set(int count, DType x, Blob* ret, Context* ctx) { +void Set(int count, DType x, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// sum all elements of input into ret template <typename DType, typename Lang> -void Sum(int count, const Blob* input, DType* ret, Context* ctx) { +void Sum(int count, const Blob *input, DType *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// ret[i] = sign(input[i]) template <typename DType, typename Lang> -void Sign(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sign(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Base is e, Neper number. ret[i]=exp(input[i]) template <typename DType, typename Lang> -void Exp(int count, const Blob* input, Blob* ret, Context* ctx) { +void Exp(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]). template <typename DType, typename Lang> -void Log(int count, const Blob* input, Blob* ret, Context* ctx) { +void Log(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=sqrt([input[i]) template <typename DType, typename Lang> -void Sqrt(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=square([input[i]) template <typename DType, typename Lang> -void Square(int count, const Blob* input, Blob* ret, Context* ctx) { +void Square(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=tanh([input[i]) template <typename DType, typename Lang> -void Tanh(int count, const Blob* input, Blob* ret, Context* ctx) { +void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=max(0, input[i]) template <typename DType, typename Lang> -void ReLU(int count, const Blob* input, Blob* ret, Context* ctx) { +void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Element-wise operation, ret[i]=sigmoid([input[i]) template <typename DType, typename Lang> -void Sigmoid(int count, const Blob* input, Blob* ret, Context* ctx) { +void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } /// Do softmax for each row invidually template <typename DType, typename Lang> -void Softmax(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) { +void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) { LOG(FATAL) << "Not Implemented"; } // TODO(wangwei) unify SumRow and SumCol. 
 /// Sum the rows of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Sum the columns of the input matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob* input, Blob* ret, Context* ctx) {
+void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
+                Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of ret
 template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-    Context* ctx) {
+void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// Add the vector v to every column of A as the column of ret
 template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob* A, const Blob* v, Blob* ret,
-    Context* ctx) {
+void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
+            Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
-
 /// Element-wise operation, do v^x for every v from the input tensor
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, do v^x for every v from the lhs and every x from rhs
 template <typename DType, typename Lang>
-void Pow(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob* input, Blob* ret,
-    Context* ctx) {
+void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = input + x
 template <typename DType, typename Lang>
-void Add(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input - x
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   Add<DType, Lang>(count, input, -x, ret, ctx);
 }
 /// ret = input * x
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* input, DType x, Blob* ret, Context* ctx)
-{
+void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 /// ret = input / x
 template <typename DType, typename Lang>
-void Div(int count, const Blob* input, DType x, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
   EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
 }
 
 /// ret = lhs + rhs
 template <typename DType, typename Lang>
-void Add(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs - rhs
 template <typename DType, typename Lang>
-void Sub(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs * rhs
 template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob* lhs, const Blob* rhs, Blob* ret,
-    Context* ctx) {
+void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
+                 Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = lhs / rhs
 template <typename DType, typename Lang>
-void Div(int count, const Blob* lhs, const Blob* rhs, Blob* ret, Context* ctx) {
+void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// outer-product.
 /// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
-    Context* ctx) {
+void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
+           Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -218,36 +224,36 @@ void Outer(int m, int n, const Blob* lhs, const Blob* rhs, Blob* ret,
 // ===== Level 1
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(int count, const Blob* input, int* ret, Context* ctx) {
+void Amax(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(int count, const Blob* input, int* ret, Context* ctx) {
+void Amin(int count, const Blob *input, int *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = sum |x| for all x in input
 template <typename DType, typename Lang>
-void Asum(int count, const Blob* input, DType* ret, Context* ctx) {
+void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret = alpha * input + ret
 template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob* input, Blob* ret, Context* ctx) {
+void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 /// ret *= x
 template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob* ret, Context* ctx) {
+void Scale(int count, DType x, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 template <typename DType, typename Lang>
-void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
-    Context* ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+         Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
@@ -255,56 +261,64 @@ void Dot(int count, const Blob* lhs, const Blob* rhs, DType* ret,
 /// ret = alpha * op(A) * v + beta * ret.
 /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
 template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob* A, const Blob* v,
-    DType beta, Blob* ret, Context* ctx) {
+void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
+          DType beta, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // ===== Level 3
-/// ret = alpha * op(A) * op(B) + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(ret) = m, cols(ret) = n.
-template <typename DType, typename Lang>
-void GEMM(bool transA, bool transB, int m, int n, int k, DType alpha,
-          const Blob* A, const Blob* B, DType beta, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
 
 // ================Random functions===========================================
 /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob* ret, Context* ctx) {
+void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob* ret, Context* ctx) {
+void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob* ret, Context* ctx) {
-  LOG(FATAL) << "Not Implemented";
-}
-
-/*Some operations would have many config/hyper-parameters, e.g., Conv, and
-these config vary among diff implementations, e.g., cuda/cudnn/opencl.
-To separate the modules, we pass a OpConf pointer to the Tensor Op function.
-The specific fields are implemented by inheriting OpConf, and casting the
-pointer between the base and the sub-class.
-class OpConf {
- public:
-  template <typename T>
-  T* CastTo() {
-    static_assert(std::is_base_of<OpConf, T>::value,
-                  "The cast type must be a sub-class of OpConf");
-    return static_cast<T*>(this);
-  }
-};
-*/
-}  // namespace singa
+void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// ======== Follow the consistency guide of the math API ========
+
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+/// Divide alpha by each element of 'in', i.e. out[i] = alpha / in[i].
+template <typename DType, typename Lang>
+void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
+         Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// Multiply matrix 'M' with a diagonal matrix whose diagonal is taken from 'v'.
+/// If side_right is true, compute M * diag(v); otherwise compute diag(v) * M.
+template <typename DType, typename Lang>
+void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+          const Blob *M, const Blob *v, Blob *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+/// C = alpha * op(A) * op(B) + beta * C.
+/// transA/transB indicate whether the internal data layout of A/B is transposed.
+template <typename DType, typename Lang>
+void GEMM(const bool transA, const bool transB, const size_t nrowA,
+          const size_t ncolB, const size_t ncolA, const DType alpha,
+          const Blob *A, const Blob *B, const DType beta, Blob *C,
+          Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+}  // namespace singa
 #endif  // SINGA_CORE_MATH_H_
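To make the new Blob-level declarations concrete, here is a hedged sketch (not part of this patch) of possible specializations: a plain-loop CPU DGMM and a CBLAS-backed GEMM guarded by USE_CBLAS. The lang::Cpp tag, Blob::data()/mutable_data(), and row-major storage are assumed; the actual specializations live in tensor_math_cpp.h and tensor_math_cuda.h.

template <>
void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
                            const size_t ncol, const Blob *M, const Blob *v,
                            Blob *out, Context *ctx) {
  const float *MPtr = static_cast<const float *>(M->data());
  const float *vPtr = static_cast<const float *>(v->data());
  float *outPtr = static_cast<float *>(out->mutable_data());
  for (size_t r = 0; r < nrow; r++)
    for (size_t c = 0; c < ncol; c++)
      // side_right: out = M * diag(v), column c scaled by v[c];
      // otherwise:  out = diag(v) * M, row r scaled by v[r].
      outPtr[r * ncol + c] =
          MPtr[r * ncol + c] * (side_right ? vPtr[c] : vPtr[r]);
}

#ifdef USE_CBLAS
#include <cblas.h>

template <>
void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
                            const size_t nrowA, const size_t ncolB,
                            const size_t ncolA, const float alpha,
                            const Blob *A, const Blob *B, const float beta,
                            Blob *C, Context *ctx) {
  // Leading dimensions for row-major storage; nrowA/ncolA/ncolB describe
  // op(A) and op(B), i.e. the shapes after applying the trans flags.
  const int lda = transA ? nrowA : ncolA;
  const int ldb = transB ? ncolA : ncolB;
  const int ldc = ncolB;
  cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
              transB ? CblasTrans : CblasNoTrans, nrowA, ncolB, ncolA, alpha,
              static_cast<const float *>(A->data()), lda,
              static_cast<const float *>(B->data()), ldb, beta,
              static_cast<float *>(C->mutable_data()), ldc);
}
#endif  // USE_CBLAS

On the GPU side the natural counterparts are cublasSdgmm and cublasSgemm; since cuBLAS assumes column-major storage, a CUDA specialization would likely compensate by flipping the side flag for DGMM and swapping the operand order for GEMM.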
