SINGA-184 Add Cross Entropy loss computation. Merge with asf dev. Fix the bug in the CUDA cross-entropy caused by the target type (it should use int instead of float).
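For reference, the forward and backward computations this patch implements (see KernelComputeCrossEntropy and KernelSoftmaxCrossEntropyBwd in math_kernel.cu below) are, for sample i with integer label t_i and softmax probabilities p:

    \ell_i = -\log\big(\max(p_{i,\,t_i}, \epsilon)\big), \qquad
    \frac{\partial \ell_i}{\partial x_{i,j}} = p_{i,j} - \mathbb{1}[j = t_i]

where \epsilon = FLT_MIN guards against log(0) and x is the pre-softmax input. Because p is indexed by the integer label t_i, the target blob must hold int values; reading int label data through a float pointer (the bug fixed here) yields garbage indices.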
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/21e4b2d7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/21e4b2d7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/21e4b2d7 Branch: refs/heads/dev Commit: 21e4b2d79a4bd3acfb8b487cf96c197da464ae70 Parents: ec17aca 26df5ac Author: Wei Wang <[email protected]> Authored: Mon Jun 13 13:04:23 2016 +0800 Committer: Wei Wang <[email protected]> Committed: Mon Jun 13 13:04:23 2016 +0800 ---------------------------------------------------------------------- CMakeLists.txt | 4 +- cmake/Dependencies.cmake | 7 +- cmake/ProtoBuf.cmake | 116 ---- include/singa/core/common.h | 2 +- include/singa/core/tensor.h | 400 +++++++------- src/CMakeLists.txt | 27 +- src/core/tensor/math_kernel.cu | 707 +++++++++++++------------ src/core/tensor/math_kernel.h | 93 ++-- src/core/tensor/tensor.cc | 761 ++++++++++++++------------- src/core/tensor/tensor_math.h | 404 +++++++------- src/core/tensor/tensor_math_cpp.h | 610 +++++++++++++++------ src/core/tensor/tensor_math_cuda.h | 423 +++++++++++---- src/model/layer/activation.cc | 10 +- src/model/layer/batchnorm.cc | 70 +++ src/model/layer/batchnorm.h | 84 +++ src/model/layer/cudnn_activation.cc | 13 +- src/model/layer/cudnn_batchnorm.cc | 214 ++++++++ src/model/layer/cudnn_batchnorm.h | 60 +++ src/model/layer/cudnn_convolution.cc | 183 ++++--- src/model/layer/cudnn_lrn.cc | 118 +++++ src/model/layer/cudnn_lrn.h | 56 ++ src/model/layer/cudnn_pooling.cc | 7 +- src/model/layer/cudnn_softmax.cc | 4 +- src/model/layer/dense.cc | 86 +++ src/model/layer/dense.h | 70 +++ src/model/layer/flatten.cc | 55 ++ src/model/layer/flatten.h | 53 ++ src/model/layer/lrn.cc | 59 +++ src/model/layer/lrn.h | 70 +++ src/model/layer/prelu.cc | 139 +++++ src/model/layer/prelu.h | 59 +++ src/model/layer/softmax.cc | 10 +- src/model/loss/softmax_cross_entropy.cc | 8 +- src/proto/model.proto | 39 +- test/CMakeLists.txt | 1 + test/singa/test_activation.cc | 8 +- test/singa/test_cross_entropy.cc | 4 +- test/singa/test_cudnn_activation.cc | 6 +- test/singa/test_cudnn_batchnorm.cc | 257 +++++++++ test/singa/test_cudnn_convolution.cc | 181 +++++++ test/singa/test_cudnn_lrn.cc | 205 ++++++++ test/singa/test_cudnn_softmax.cc | 6 +- test/singa/test_dense.cc | 249 +++++++++ test/singa/test_flatten.cc | 144 +++++ test/singa/test_mse.cc | 1 - test/singa/test_prelu.cc | 137 +++++ test/singa/test_sgd.cc | 2 +- test/singa/test_softmax.cc | 12 +- test/singa/test_tensor_math.cc | 303 ++++++++++- 49 files changed, 4873 insertions(+), 1664 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/CMakeLists.txt ---------------------------------------------------------------------- diff --cc CMakeLists.txt index fbe3adc,fbe3adc..a9d9b17 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -10,7 -10,7 +10,9 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT IF(UNIX OR APPLE) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall") ENDIF() -- ++IF(CMAKE_BUILD_TYPE MATCHES "Debug") ++ SET(NVCC_FLAG "${NVCC_FLAG} -g -G ") ++ENDIF() #message(STATUS "${CMAKE_CXX_FLAGS}") SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}") #message(STATUS "include path: ${SINGA_INCLUDE_DIR}") http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/include/singa/core/tensor.h ---------------------------------------------------------------------- diff --cc
include/singa/core/tensor.h index 865e1e4,cd750c5..8cfa705 --- a/include/singa/core/tensor.h +++ b/include/singa/core/tensor.h @@@ -212,199 -213,171 +213,180 @@@ Tensor Reshape(const Tensor &in, Shape /// Copy 'num' elements of src to dst. /// The first 'src_offset' ('dst_offset') elements will be skipped. - void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num, - size_t src_offset = 0, size_t dst_offset = 0); - - // ==================Simple Linear Algebra Operations========================= - Tensor Abs(const Tensor &t); - Tensor Exp(const Tensor &t); - Tensor Log(const Tensor &t); - Tensor ReLU(const Tensor &t); - Tensor Sigmoid(const Tensor &t); - Tensor Sign(const Tensor &t); - Tensor Sqrt(const Tensor &t); - Tensor Square(const Tensor &t); - Tensor Tanh(const Tensor &t); + void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num, + const size_t src_offset = 0, const size_t dst_offset = 0); + + // =============Element-wise operations==================================== + Tensor Abs(const Tensor &in); + Tensor Exp(const Tensor &in); + Tensor Log(const Tensor &in); + Tensor ReLU(const Tensor &in); + Tensor Sigmoid(const Tensor &in); + Tensor Sign(const Tensor &in); + Tensor Sqrt(const Tensor &in); + Tensor Square(const Tensor &in); + Tensor Tanh(const Tensor &in); + + /// Element-wise operation, out[i]=in[i]^x + template <typename SType> + Tensor Pow(const Tensor &in, const SType x); + /// Element-wise operation, out[i]=in[i]^x + template <typename SType> + void Pow(const Tensor &in, const SType x, Tensor *out); + /// Element-wise operation, out[i]=base[i]^exp[i] + Tensor Pow(const Tensor &base, const Tensor &exp); + /// Element-wise operation, out[i]=base[i]^exp[i] + void Pow(const Tensor &base, const Tensor &exp, Tensor *out); + /// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f template <typename SType> - SType Sum(const Tensor &t); - /// Sum elements in the Tensor, currently only support vector and matrix. - /// if 'axis' is 0, sum all rows into a single row - /// if 'axis' is 1, sum all columns into a single column - /// TODO(wangwei) support arbitrary Tensor like numpy.sum - Tensor Sum(const Tensor &t, int axis); + Tensor operator<(const Tensor &in, const SType x); + template <typename SType> + void LT(const Tensor &in, const SType x, Tensor *out); - /// Average elements in the Tensor, currently only support vector and matrix. - /// if 'axis' is 0, average all rows into a single row - /// if 'axis' is 1, average all columns into a single column - /// TODO(wangwei) support arbitrary Tensor like numpy.average - Tensor Average(const Tensor &t, int axis); - /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor. - Tensor SoftMax(const Tensor &in); - /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor. - void SoftMax(const Tensor &in, Tensor *out); + /// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f + template <typename SType> + Tensor operator<=(const Tensor &in, const SType x); + template <typename SType> + void LE(const Tensor &in, const SType x, Tensor *out); - + /// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f + template <typename SType> + Tensor operator>(const Tensor &in, const SType x); + template <typename SType> + void GT(const Tensor &in, const SType x, Tensor *out); - /// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows, - /// and shape_[axis+1]*...*shape_[nDim()] columns. - /// and do softmax along each row.
- // Tensor Softmax(const Tensor& t, int axis = -1); - // void Softmax(const Tensor& t, Tensor* ret, int axis = -1); - - /// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f - template <typename DType> - Tensor operator<(const Tensor &t, const DType x); - template <typename DType> - void LT(const Tensor &t, DType x, Tensor *ret); - - /// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f - template <typename DType> - Tensor operator<=(const Tensor &t, const DType x); - template <typename DType> - void LE(const Tensor &t, DType x, Tensor *ret); - - /// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f - template <typename DType> - Tensor operator>(const Tensor &t, const DType x); - template <typename DType> - void GT(const Tensor &t, DType x, Tensor *ret); - - /// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f - template <typename DType> - Tensor operator>=(const Tensor &t, const DType x); - template <typename DType> - void GE(const Tensor &t, DType x, Tensor *ret); - - /// Element-wise opeartion, ret[i]=t[i]^x - template <typename DType> - Tensor Pow(const Tensor &t, DType x); - /// Element-wise opeartion, ret[i]=t[i]^x - template <typename DType> - void Pow(const Tensor &t, DType x, Tensor *ret); - /// Element-wise opeartion, ret[i]=baes[i]^exp[i] - Tensor Pow(const Tensor &base, Tensor exp); - /// Element-wise opeartion, ret[i]=baes[i]^exp[i] - void Pow(const Tensor &base, const Tensor &exp, Tensor *ret); + /// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f + template <typename SType> + Tensor operator>=(const Tensor &in, const SType x); + template <typename SType> + void GE(const Tensor &in, const SType x, Tensor *out); Tensor operator+(const Tensor &lhs, const Tensor &rhs); - void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret); + void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out); Tensor operator-(const Tensor &lhs, const Tensor &rhs); - void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret); + void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out); Tensor operator*(const Tensor &lhs, const Tensor &rhs); - void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret); + void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out); Tensor operator/(const Tensor &lhs, const Tensor &rhs); - void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret); - - template <typename DType> - Tensor operator+(const Tensor &t, DType x); - template <typename DType> - void Add(const Tensor &t, DType x, Tensor *ret); - - template <typename DType> - Tensor operator-(const Tensor &t, DType x); - template <typename DType> - void Sub(const Tensor &t, DType x, Tensor *ret); + void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out); - template <typename DType> - Tensor operator*(const Tensor &t, DType x); - template <typename DType> - void EltwiseMult(const Tensor &t, DType x, Tensor *ret); - - template <typename DType> - Tensor operator/(const Tensor &t, DType x); - template <typename DType> - void Div(const Tensor &t, DType x, Tensor *ret); - - // ================Blas operations============================================ - // We fix the scalar argument type to be float. 
+ template <typename SType> + Tensor operator+(const Tensor &in, const SType x); + template <typename SType> + void Add(const Tensor &in, const SType x, Tensor *out); - // ===== Level 1 - // TODO(wangwei) make amax/amin/asum a member function of tensor - // void Amax(Tensor, Context* ctx); Get the index of the max value in a vector - // void Asum(Tensor Context* ctx); + template <typename SType> + Tensor operator-(const Tensor &in, const SType x); + template <typename SType> + void Sub(const Tensor &in, const SType x, Tensor *out); - // template <typename DType> - // void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx); + template <typename SType> + Tensor operator*(const Tensor &in, const SType x); + template <typename SType> + void EltwiseMult(const Tensor &in, const SType x, Tensor *out); - /// Do matrix vector multipication or matrix matrix multiplication depdending - /// on the Tensor shape. result = A * B - Tensor Mult(const Tensor &A, const Tensor &B); - /// Do matrix vector multipication or matrix matrix multiplication depdending - /// on the Tensor shape. C = A * B - void Mult(const Tensor &A, const Tensor &B, Tensor *C); + /// For each element e of Tensor 'in', compute e / x + template <typename SType> + Tensor operator/(const Tensor &in, const SType x); + /// For each element e of Tensor 'in', compute e / x into out + template <typename SType> + void Div(const Tensor &in, const SType x, Tensor *out); - /// Do matrix vector multipication or matrix matrix multiplication depdending - /// on the Tensor shape. ret = alpha lhs * rhs + beta * ret - void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs, - const float beta, Tensor *C); + /// For each element e of Tensor 'in', compute x/e + template <typename SType> + Tensor Div(const SType x, const Tensor &in); + /// For each element e of Tensor 'in', compute x/e into 'out' + template <typename SType> + void Div(const SType x, const Tensor &in, Tensor *out); - // ================Random operations========================================== - /// For each element x set x = 1 if random() < p; otherwise x = 1. - void Bernoulli(float p, Tensor *t); - /// Fill in Tensor 't' following uniform distribution. - void Uniform(float low, float high, Tensor *t); - /// Fill in Tensor 't' following Gaussian distribution. - void Gaussian(float mean, float std, Tensor *t); + template <typename SType> + SType Sum(const Tensor &in); - + // ============Matrix (row/column) operations================================== + /// Average elements in the Tensor, currently only support vector and matrix. + /// if 'axis' is 0, average all rows into a single row + /// if 'axis' is 1, average all columns into a single column + /// TODO(wangwei) support arbitrary Tensor like numpy.average + Tensor Average(const Tensor &in, const int axis); -/// Sum elements in the Tensor, currently only support vector and matrix. -/// if 'axis' is 0, sum all rows into a single row -/// if 'axis' is 1, sum all columns into a single column -/// TODO(wangwei) support arbitrary Tensor like numpy.sum -Tensor Sum(const Tensor &in, const int axis); -/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows, -/// and shape_[axis]*...*shape_[nDim()] columns. -/// and do softmax along each row. 
-Tensor SoftMax(const Tensor &in, const int axis = 0); -void SoftMax(const Tensor &in, const int axis, Tensor *out); - // follow the consistency guide - // https://issues.apache.org/jira/browse/SINGA-182 - // ============Matrix vector operations======================================= /// Add column 'v' with each column of matrix M void AddColumn(const Tensor &v, Tensor *M); - void AddColumn(const float alpha, const float beta, const Tensor &v, + /// For each column 'c' of matrix out, do c=alpha*v + beta*c + template <typename SType> + void AddColumn(const SType alpha, const SType beta, const Tensor &v, Tensor *out); - /// Sub column 'v' by each column of matrix M - void SubColumn(const Tensor &v, Tensor *M); - /// Multiply column 'v' and each column of matrix M; write results into 'out' - void MultColumn(const Tensor &v, Tensor *M); - /// Divide column 'v' by each column of matrix M; write results into 'out' - void DivColumn(const Tensor &v, Tensor *M); - /// Add row 'v' with each row of matrix M; write results into 'out' void AddRow(const Tensor &v, Tensor *out); - void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M); - /// Sub row 'v' by each row of matrix M; write results into 'out' - void SubRow(const Tensor &v, Tensor *M); - /// Multiply row 'v' with each row of matrix M; write results into 'out' - void MultRow(const Tensor &v, Tensor *M); + /// For each row 'r' of matrix out, do r=alpha*v + beta*r + template <typename SType> + void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M); + /// Divide column 'v' by each column of matrix M; write results into 'out' + void DivColumn(const Tensor &v, Tensor *M); /// Divide row 'v' by each row of matrix M; write results into 'out' void DivRow(const Tensor &v, Tensor *M); - - /// Sum all rows of matrix M into a single row as 'out' - void SumRows(const Tensor &M, Tensor *out); + /// Multiply column 'v' and each column of matrix M; write results into 'out' + void MultColumn(const Tensor &v, Tensor *M); + /// Multiply row 'v' with each row of matrix M; write results into 'out' + void MultRow(const Tensor &v, Tensor *M); ++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor. ++Tensor SoftMax(const Tensor &in); ++/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor. ++void SoftMax(const Tensor &in, Tensor *out); + /// Sub column 'v' by each column of matrix M + void SubColumn(const Tensor &v, Tensor *M); + /// Sub row 'v' by each row of matrix M; write results into 'out' + void SubRow(const Tensor &v, Tensor *M); /// Sum all columns of matrix M into a single column as 'out' void SumColumns(const Tensor &M, Tensor *out); + /// Sum all rows of matrix M into a single row as 'out' + void SumRows(const Tensor &M, Tensor *out); + ++/// Sum elements in the Tensor, currently only support vector and matrix. ++/// if 'axis' is 0, sum all rows into a single row ++/// if 'axis' is 1, sum all columns into a single column ++/// TODO(wangwei) support arbitrary Tensor like numpy.sum ++Tensor Sum(const Tensor &in, const int axis); ++ + // ================Random operations========================================== + /// For each element x set x = 1 if random() < p; otherwise x = 0. + template <typename SType> + void Bernoulli(const SType p, Tensor *out); + /// Fill in Tensor 'out' following Gaussian distribution. + template <typename SType> + void Gaussian(const SType mean, const SType std, Tensor *out); + /// Fill in Tensor 'out' following uniform distribution.
+ template <typename SType> + void Uniform(const SType low, const SType high, Tensor *out); + + // ================Blas operations============================================ + // TODO(wangwei) make amax/amin/asum a member function of tensor - /// For each element x of Tensor 'in', compute alpha/x + /// out = alpha*in + out template <typename SType> - Tensor Div(const SType alpha, const Tensor &in); + void Axpy(SType alpha, const Tensor &in, Tensor *out); - /// For each element x of Tensor 'in', compute alpha/x into 'out' + /// Do matrix vector multiplication or matrix matrix multiplication depending + /// on the Tensor shape. result = A * B + Tensor Mult(const Tensor &A, const Tensor &B); + /// Do matrix vector multiplication or matrix matrix multiplication depending + /// on the Tensor shape. C = A * B + void Mult(const Tensor &A, const Tensor &B, Tensor *C); - + /// Do matrix vector multiplication or matrix matrix multiplication depending + /// on the Tensor shape. out = alpha * lhs * rhs + beta * out template <typename SType> - void Div(const SType alpha, const Tensor &in, Tensor *out); - - /* - /// Multiply each column of the lhs matrix with the rhs column - Tensor MultColumn(const Tensor &lhs, const Tensor &rhs); - void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret); - /// Multiply each row of the lhs matrix with the rhs row - Tensor MultRow(const Tensor &lhs, const Tensor &rhs); - void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret); - /// Div each row of the lhs matrix with the rhs column - Tensor DivColumn(const Tensor &lhs, const Tensor &rhs); - void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret); - /// Divide each row of the lhs matrix by the rhs row - Tensor DivRow(const Tensor &lhs, const Tensor &rhs); - void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret); - */ + void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta, + Tensor *C); + ++// ***************** ++// Misc. ++// **************** +/// Compute the cross entropy loss given the prediction probability 'p' and +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector +/// or 2-d matrix. 'loss' is 1-d vector. The loss is computed into 'loss'. - void ComputeCrossEntropy(const Tensor& t, Tensor* p); ++void ComputeCrossEntropy(const Tensor& p, const Tensor& t, Tensor* loss); +/// Compute dx, given prediction probability 'p' (p=softmax(x)) and +/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector +/// or 2-d matrix. 'grad' has the same shape as 'p'. dx is computed into p.
+void SoftmaxCrossEntropyBwd(const Tensor& t, Tensor* p); } // namespace singa #endif // SINGA_CORE_TENSOR_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/CMakeLists.txt ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.cu ---------------------------------------------------------------------- diff --cc src/core/tensor/math_kernel.cu index f12763e,b618f9b..21ebdd8 --- a/src/core/tensor/math_kernel.cu +++ b/src/core/tensor/math_kernel.cu @@@ -309,142 -258,170 +258,202 @@@ __global__ void KernelLE(const int num } } - __global__ static void kernel_set_value(float *data, float value, int n) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - for (; index < n; index += num_threads) { - data[index] = value; + __global__ void KernelLT(const int num, const float *in, const float x, + float *out) { + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; + idx += blockDim.x * gridDim.x) { + out[idx] = in[idx] < x ? 1.0f : 0.0f; + } + } ++__global__ void KernelComputeCrossEntropy(const size_t batchsize, ++ const size_t dim, const float *p, ++ const int *t, float *loss) { ++ size_t sample = blockIdx.x * blockDim.x + threadIdx.x; ++ size_t num_threads = blockDim.x * gridDim.x; ++ for (; sample < batchsize; sample += num_threads) { ++ float prob_of_truth = p[sample * dim + t[sample]]; ++ loss[sample] = -std::log(max(prob_of_truth, FLT_MIN)); + } +} - __global__ void kernel_threshold(const float *src_data, float *des_data, - float alpha, int n) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int num_threads = blockDim.x * gridDim.x; - for (; index < n; index += num_threads) { - des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f; ++__global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize, ++ const size_t dim, const float *p, ++ const int *t, float *grad) { ++ size_t sample = blockIdx.x * blockDim.x + threadIdx.x; ++ size_t num_threads = blockDim.x * gridDim.x; ++ for (; sample < batchsize; sample += num_threads) { ++ size_t pos = sample * dim + t[sample]; ++ grad[pos] = p[pos] - 1.0f; // TODO(wangwei) Consider p and grad are diff + } +} - void sum(int n, const float *in, float *out) { - int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n; - // here, we only need one block - int num_blocks = 1; + // ******************************** + // Functions call kernels + // ******************************** + + void set(const size_t n, const float v, float *out, cudaStream_t s) { + KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out); + } + + void abs(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); + } - kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n); + void sign(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void sum_row(int rows, int cols, int stride, const float *in, float *out) { - int threads_per_block = rows > CU1DBLOCK ? 
CU1DBLOCK : rows; - int num_blocks = cols; + void exp(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); + } + + void log(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); + } - kernel_sum_row << <num_blocks, threads_per_block>>> - (in, out, rows, cols, stride); + void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void sum_col(int rows, int cols, int stride, const float *in, float *out) { - int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols; - int num_blocks = rows; + void square(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); + } - kernel_sum_col << <num_blocks, threads_per_block>>> - (in, out, rows, cols, stride); + void tanh(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void add_row(int rows, int cols, int stride, const float *in_row, - const float *in_mat, float *out) { - dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); - dim3 num_blocks( - cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1), - rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1)); - kernel_add_vec_row << <num_blocks, threads_per_block>>> - (in_row, in_mat, out, rows, cols, stride); + + void relu(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void add(int n, const float *a, const float *b, float *out) { - kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n); + void sigmoid(const int n, const float *in, float *out, cudaStream_t s) { + KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void sub(int n, const float *a, const float *b, float *out) { - kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n); + void softplus(const size_t n, const float *in, float *out, cudaStream_t s) { + KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out); } - void exp(int n, const float *in, float *out) { - kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void clamp(const size_t n, const float low, const float high, const float *in, + float *out, cudaStream_t s) { + KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out); } - void log(int n, const float *in, float *out) { - kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void pow(const size_t n, const float *in, const float x, float *out, + cudaStream_t s) { + KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out); } - void sigmoid(int n, const float *in, float *out) { - kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void add(const size_t n, const float *in, const float x, float *out, + cudaStream_t s) { + KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out); } - void sigmoid_grad(int n, const float *in, float *out) { - kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void mult(const size_t n, const float *in, const float x, float *out, + cudaStream_t s) { + KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out); } - void relu(int n, const float *in, float *out) { - kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void div(const size_t n, 
const float x, const float *in, float *out, + cudaStream_t s) { + KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out); } - void relu_grad(int n, const float *in, float *out) { - kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void threshold(const size_t n, const float x, const float *in, float *out, + cudaStream_t s) { + KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out); } - void tanh(int n, const float *in, float *out) { - kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void gt(const size_t num, const float *in, const float x, float *out, + cudaStream_t s) { + KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out); + } + void ge(const size_t num, const float *in, const float x, float *out, + cudaStream_t s) { + KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out); + } + void lt(const size_t num, const float *in, const float x, float *out, + cudaStream_t s) { + KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out); + } + void le(const size_t num, const float *in, const float x, float *out, + cudaStream_t s) { + KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out); } - void tanh_grad(int n, const float *in, float *out) { - kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void pow(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s) { + KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out); } - void softplus(int n, const float *in, float *out) { - kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void add(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s) { + KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out); } - void softplus_grad(int n, const float *in, float *out) { - kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void sub(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s) { + KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out); } - void square(int n, const float *in, float *out) { - kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void mult(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s) { + KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out); } - void square_grad(int n, const float *in, float *out) { - kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void div(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s) { + KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out); } - void sqrt(int n, const float *in, float *out) { - kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); + void sum(const size_t n, const float *in, float *out, cudaStream_t s) { + int threads_per_block = n > CU1DBLOCK ? 
CU1DBLOCK : n; + // here, we only need one block + int num_blocks = 1; + KernelSum <<<num_blocks, threads_per_block>>> (n, in, out); } + - void pow(int n, const float *a, const float *b, float *out) { - kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n); ++void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p, ++ const int *t, float *loss, cudaStream_t stream) { ++ KernelComputeCrossEntropy <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>> ++ (batchsize, dim, p, t, loss); +} + - void mult(int n, const float *a, const float *b, float *out) { - kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n); ++void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p, ++ const int *t, float *grad, cudaStream_t stream) { ++ KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>> ++ (batchsize, dim, p, t, grad); ++} + /* + void square_grad(int n, const float *in, float *out, cudaStream_t s) { + kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); } - void mult(int n, const float *a, const float x, float *out) { - kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n); + void tanh_grad(int n, const float *in, float *out, cudaStream_t s) { + kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); } - void div(int n, const float *a, const float *b, float *out) { - kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n); + + void relu_grad(int n, const float *in, float *out, cudaStream_t s) { + kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); } - void set_value(int n, float v, float *out) { - kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n); + + void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) { + kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); } - void threshold(int n, float alpha, const float *in, float *out) { - kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n); + void softplus_grad(int n, const float *in, float *out, cudaStream_t s) { + kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n); } - // follow the consistency guide for math API - __global__ void KernelDiv(const size_t num, const float alpha, const float *in, - float *out) { - for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num; - idx += blockDim.x * gridDim.x) { - out[idx] = alpha / in[idx]; + + __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, + int rows, int cols, int stride) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < rows; index += num_threads) { + dst_vec_data[index] = 0.0f; + for (int k = 0; k < cols; k++) { + dst_vec_data[index] += src_mat_data[index * stride + k]; + } } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/math_kernel.h ---------------------------------------------------------------------- diff --cc src/core/tensor/math_kernel.h index 09953e4,d8a58a5..976b78f --- a/src/core/tensor/math_kernel.h +++ b/src/core/tensor/math_kernel.h @@@ -31,72 -31,66 +31,73 @@@ namespace singa // TODO(wangwei) make all function templates. 
namespace cuda { - void sum(int n, const float *in, float *out); - void sum_row(int rows, int cols, int stride, const float *in, float *out); - - void sum_col(int rows, int cols, int stride, const float *in, float *out); - - void add_row(int rows, int cols, int stride, const float *in_row, - const float *in_mat, float *out); - - void add(int n, const float *a, const float *b, float *out); - - void sub(int n, const float *a, const float *b, float *out); - - void exp(int n, const float *in, float *out); - - void log(int n, const float *in, float *out); - - void sigmoid(int n, const float *in, float *out); - - void sigmoid_grad(int n, const float *in, float *out); - - void relu(int n, const float *in, float *out); - - void relu_grad(int n, const float *in, float *out); - - void tanh(int n, const float *in, float *out); - - void tanh_grad(int n, const float *in, float *out); + // 0 input + void set(const size_t n, const float v, float *out, cudaStream_t s); + + // 1 input + void abs(const size_t n, const float *in, float *out, cudaStream_t s); + void sign(const size_t n, const float *in, float *out, cudaStream_t s); + void exp(const size_t n, const float *in, float *out, cudaStream_t s); + void log(const size_t n, const float *in, float *out, cudaStream_t s); + void sqrt(const size_t n, const float *in, float *out, cudaStream_t s); + void square(const size_t n, const float *in, float *out, cudaStream_t s); + void tanh(const size_t n, const float *in, float *out, cudaStream_t s); + void relu(const size_t n, const float *in, float *out, cudaStream_t s); + void sigmoid(const int n, const float *in, float *out, cudaStream_t s); + void softplus(const size_t n, const float *in, float *out, cudaStream_t s); + void clamp(const size_t n, const float low, const float high, const float *in, + float *out, cudaStream_t s); + + void pow(const size_t n, const float *in, const float x, float *out, + cudaStream_t s); - void softplus(int n, const float *in, float *out); + void add(const size_t n, const float *in, const float x, float *out, + cudaStream_t s); - void softplus_grad(int n, const float *in, float *out); + void mult(const size_t n, const float *in, const float x, float *out, + cudaStream_t s); - void square(int n, const float *in, float *out); + void div(const size_t n, const float x, const float *in, float *out, + cudaStream_t s); - void square_grad(int n, const float *in, float *out); + void threshold(const size_t n, const float x, const float *in, float *out, + cudaStream_t s); - void sqrt(int n, const float *in, float *out); + void gt(const size_t num, const float *in, const float x, float *out, + cudaStream_t s); + void ge(const size_t num, const float *in, const float x, float *out, + cudaStream_t s); + void lt(const size_t num, const float *in, const float x, float *out, + cudaStream_t s); + void le(const size_t num, const float *in, const float x, float *out, + cudaStream_t s); - void pow(int n, const float *a, const float *b, float *out); + // 2 inputs + void pow(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s); - void mult(int n, const float *a, const float *b, float *out); + void add(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s); - void mult(int n, const float *a, const float x, float *out); + void sub(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s); - void div(int n, const float *a, const float *b, float *out); + void mult(const size_t n, const float *in1, const float *in2, float *out, + 
cudaStream_t s); - void set_value(int n, float v, float *out); + void div(const size_t n, const float *in1, const float *in2, float *out, + cudaStream_t s); - void threshold(int n, float alpha, const float *in, float *out); + void sum(const size_t n, const float *in, float *out, cudaStream_t s); - // follow the consistency guide for math API +void ComputeCrossEntropy(const size_t batchsize, const size_t dim, + const float *p, const int *t, float *loss, + cudaStream_t stream); - void Div(const size_t num, const float x, const float *in, float *out, - cudaStream_t s); - void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s); - void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s); - void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s); - void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s); - void Set(const size_t num, const float x, float *out, cudaStream_t s); +void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, + const float *p, const int *t, float *grad, + cudaStream_t stream); + } // cuda } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --cc src/core/tensor/tensor.cc index 1ac25c6,e62386a..4e0d94b --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@@ -592,42 -557,8 +556,8 @@@ void AddRow(const SType alpha, const ST Mult(alpha, one, vmat, beta, M); } } - void ComputeCrossEntropy(const Tensor& t, Tensor* p) { - CHECK_LE(p->nDim(), 2u); - CHECK_LE(t.nDim(), 2u); // TODO(wangwei) consider multi-labels. - size_t batchsize = 1; - if (p->nDim() == 2u) batchsize = p->shape(0); - size_t dim = p->Size() / batchsize; - TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, { - p->device()->Exec([batchsize, dim, t, p](Context *ctx) { - ComputeCrossEntropy<DType, Lang>(batchsize, dim, p->blob(), t.blob(), - p->blob(), ctx); - }, {p->blob(), t.blob()}, {p->blob()}); - }); - } - - template <typename SType> Tensor Div(const SType alpha, const Tensor &in) { - Tensor out(in.shape(), in.device(), in.data_type()); - Div(alpha, in, &out); - return out; - } - - template Tensor Div<float>(const float, const Tensor &); - - template <typename SType> - void Div(const SType alpha, const Tensor &in, Tensor *out) { - CheckDataTypeAndLang(in, *out); - CHECK(in.shape() == out->shape()); - TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { - // TODO(wangwei) type cast SType to DType; - in.device()->Exec( - [alpha, in, out](Context *ctx) { - Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx); - }, - {in.blob()}, {out->blob()}); - }); - } - template void Div<float>(const float, const Tensor &, Tensor *); -template <> ++template + void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M); /// Divide column 'v' by each column of matrix M; write results into 'out' void DivColumn(const Tensor &v, Tensor *M) { @@@ -725,4 -639,92 +638,122 @@@ void SumRows(const Tensor &M, Tensor *v Mult(X, one, v); } } + // ====================Random operations===================================== + template <typename SType> + void Bernoulli(const SType p, Tensor *out) { + TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { + auto prob = TypeCast<SType, DType>(p); + out->device()->Exec([prob, out](Context *ctx) { + Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx); + }, {}, 
{out->blob()}, true); + }); + } + template void Bernoulli<float>(const float p, Tensor *out); + + template <typename SType> + void Uniform(const SType low, const SType high, Tensor *out) { + TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { + auto l = TypeCast<SType, DType>(low); + auto h = TypeCast<SType, DType>(high); + out->device()->Exec([l, h, out](Context *ctx) { + Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx); + }, {}, {out->blob()}, true); + }); + } + template void Uniform<float>(const float low, const float high, Tensor *out); + + template <typename SType> + void Gaussian(const SType mean, const SType std, Tensor *out) { + TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, { + auto m = TypeCast<SType, DType>(mean); + auto s = TypeCast<SType, DType>(std); + out->device()->Exec([m, s, out](Context *ctx) { + Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx); + }, {}, {out->blob()}, true); + }); + } + template void Gaussian<float>(const float mean, const float std, Tensor *out); + + // ================Blas operations============================================ + template <typename SType> + void Axpy(const SType alpha, const Tensor &in, Tensor *out) { + TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, { + auto a = TypeCast<SType, DType>(alpha); + out->device()->Exec([a, in, out](Context *ctx) { + Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx); + }, {in.blob(), out->blob()}, {out->blob()}); + }); + } -template <> -void Axpy(const float alpha, const Tensor &in, Tensor *out); ++template void Axpy(const float alpha, const Tensor &in, Tensor *out); + + Tensor Mult(const Tensor &A, const Tensor &B) { + Shape s; + s.push_back(A.shape(0)); + if (B.nDim() == 2) s.push_back(B.shape(1)); + Tensor out(s, A.device(), A.data_type()); + Mult(A, B, &out); + return out; + } + + void Mult(const Tensor &A, const Tensor &B, Tensor *out) { + Mult(1.0f, A, B, 0.0f, out); + } + + template <typename SType> + void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta, + Tensor *C) { + CHECK_EQ(A.shape().size(), 2u); + if (B.nDim() == 1u) { + TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { + auto a = TypeCast<SType, DType>(alpha); + auto b = TypeCast<SType, DType>(beta); + C->device()->Exec([a, A, b, B, C](Context *ctx) { + GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(), + B.blob(), b, C->blob(), ctx); + }, {A.blob(), B.blob()}, {C->blob()}); + }); + } else { + CHECK(!C->transpose()); + TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { + auto a = TypeCast<SType, DType>(alpha); + auto b = TypeCast<SType, DType>(beta); + C->device()->Exec([a, A, b, B, C](Context *ctx) { + GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1), + A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx); + }, {A.blob(), B.blob()}, {C->blob()}); + }); + } + } + ++ ++// ************************ ++// Misc. ++// *********************** ++void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) { ++ CHECK_LE(p.nDim(), 2u); ++ CHECK_LE(t.nDim(), 2u); // TODO(wangwei) consider multi-labels. 
++ size_t batchsize = 1; ++ if (p.nDim() == 2u) batchsize = p.shape(0); ++ size_t dim = p.Size() / batchsize; ++ TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, { ++ p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) { ++ ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.blob(), t.blob(), ++ loss->blob(), ctx); ++ }, {p.blob(), t.blob()}, {loss->blob()}); ++ }); ++} ++void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) { ++ CHECK_LE(p->nDim(), 2u); ++ CHECK_LE(t.nDim(), 2u); // TODO(wangwei) consider multi-labels. ++ size_t batchsize = 1; ++ if (p->nDim() == 2u) ++ batchsize = p->shape(0); ++ size_t dim = p->Size() / batchsize; ++ TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, { ++ p->device()->Exec([batchsize, dim, t, p](Context *ctx) { ++ SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->blob(), t.blob(), ++ p->blob(), ctx); ++ }, {p->blob(), t.blob()}, {p->blob()}); ++ }); ++} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math.h ---------------------------------------------------------------------- diff --cc src/core/tensor/tensor_math.h index bcf4908,b86e1cb..12490d1 --- a/src/core/tensor/tensor_math.h +++ b/src/core/tensor/tensor_math.h @@@ -269,109 -293,74 +293,95 @@@ void Scale(const size_t num, const DTyp template <typename DType, typename Lang> void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out, Context *ctx) { - LOG(FATAL) << "Not Implemented"; + LOG(FATAL) << "Dot Not Implemented"; } - // ===== Level 2 - /// ret = alpha * op(A) * v + beta * ret. - /// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n. + /// out = alpha * A * v + beta * out. + /// trans indicates if the internal data layout of A is transposed template <typename DType, typename Lang> - void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v, - DType beta, Blob *ret, Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void GEMV(bool trans, const size_t m, const size_t n, const DType alpha, + const Blob *A, const Blob *v, const DType beta, Blob *out, + Context *ctx) { + LOG(FATAL) << "GEMV Not Implemented"; } - // ===== Level 3 - - // ================Random functions=========================================== - /// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1 - // Get the random generator from 'ctx' - // If DType is not float, then convert the threshold to DType - template <typename DType, typename Lang> - void Bernoulli(int count, float p, Blob *ret, Context *ctx) { - LOG(FATAL) << "Not Implemented"; - } - // The random generator should be extracted from ctx. - // If DType is not float, then convert the low and high to DType + /// multiply a matrix with a diagonal matrix constructed using values from 'v'. + /// if side_right is true, do M*diag(v); else do diag(v)*M template <typename DType, typename Lang> - void Uniform(int count, float low, float high, Blob *ret, Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void DGMM(const bool side_right, const size_t nrow, const size_t ncol, + const Blob *M, const Blob *v, Blob *out, Context *ctx) { + LOG(FATAL) << "DGMM Not Implemented"; } - // The random generator should be extracted from ctx. - // If DType is not float, then convert the mean and std to DType + + /// C = alpha * A * B + beta * C.
+ /// transA indicates if the internal data layout of A is transposed template <typename DType, typename Lang> - void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void GEMM(const bool transA, const bool transB, const size_t nrowA, + const size_t ncolB, const size_t ncolA, const DType alpha, + const Blob *A, const Blob *B, const DType beta, Blob *C, + Context *ctx) { + LOG(FATAL) << "GEMM Not Implemented"; } - // ========follow the consistency guide of math API - +/// Compute the cross entropy loss over 'batchsize' samples, given the +/// probabilities 'p' and the (integer) target labels 't'; the per-sample +/// losses are written into 'loss'. +// following the consistency guide. +template <typename DType, typename Lang> +void ComputeCrossEntropy(const size_t batchsize, const size_t dim, + const Blob *p, const Blob *t, Blob *loss, + Context *ctx) { + LOG(FATAL) << "Not Implemented"; +} - template <typename DType, typename Lang> - void Div(const size_t num, const DType alpha, const Blob *in, Blob *out, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; - } + - /// multiply a matrix with a diagnoal matrix constructed using values from 'v'. - /// if matrix_lef_side is true, do M*v; else do v*M +template <typename DType, typename Lang> - void DGMM(const bool side_right, const size_t nrow, const size_t ncol, - const Blob *M, const Blob *v, Blob *out, Context *ctx) { ++void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, ++ const Blob *p, const Blob *t, Blob *grad, ++ Context *ctx) { + LOG(FATAL) << "Not Implemented"; +} + - /// C = alpha * A * B + beta * C. - /// transA indicates if the internal data layout is transposed of A + // ************************************** + // Matrix functions + // ************************************** + /* + /// Add the vector v to every column of A as the column of out template <typename DType, typename Lang> - void GEMM(const bool transA, const bool transB, const size_t nrowA, - const size_t ncolB, const size_t ncolA, const DType alpha, - const Blob *A, const Blob *B, const DType beta, Blob *C, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, + Blob *out, Context *ctx) { + LOG(FATAL) << "AddCol Not Implemented"; } - /// ret[i]=(input[i]<x)?1.f:0.f + // TODO(wangwei) unify AddRow and AddCol. + /// Add the vector v to every row of A as the row of out template <typename DType, typename Lang> - void LT(const size_t num, const Blob *in, const DType x, Blob *out, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v, + Blob *out, Context *ctx) { + LOG(FATAL) << "AddRow Not Implemented"; } - /// ret[i]=(input[i]<=x)?1.f:0.f - template <typename DType, typename Lang> - void LE(const size_t num, const Blob *in, const DType x, Blob *out, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; - } - /// ret[i]=(input[i]>x)?1.f:0.f + /// outer-product. + /// in1 and in2 are vectors of len m and n.
out is matrix of shape m * n template <typename DType, typename Lang> - void GT(const size_t num, const Blob *in, const DType x, Blob *out, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2, + Blob *out, Context *ctx) { + LOG(FATAL) << "Outer Not Implemented"; } - /// ret[i]=(input[i]>=x)?1.f:0.f + + /// Sum the columns of the in matrix into a vector template <typename DType, typename Lang> - void GE(const size_t num, const Blob *in, const DType x, Blob *out, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, + Context *ctx) { + LOG(FATAL) << "SumColumns Not Implemented"; } +template <typename DType, typename Lang> +void Set(const size_t num, const DType x, Blob *out, Context *ctx) { + LOG(FATAL) << "Not Implemented"; +} + + // TODO(wangwei) unify SumRow and SumCol. + /// Sum the rows of the in matrix into a vector template <typename DType, typename Lang> - void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, - const Blob *p, const Blob *t, Blob *grad, - Context *ctx) { - LOG(FATAL) << "Not Implemented"; + void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, + Context *ctx) { + LOG(FATAL) << "SumRows Not Implemented"; } - + */ } // namespace singa #endif // SINGA_CORE_MATH_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cpp.h ---------------------------------------------------------------------- diff --cc src/core/tensor/tensor_math_cpp.h index 907c656,0b280a3..c5d092b --- a/src/core/tensor/tensor_math_cpp.h +++ b/src/core/tensor/tensor_math_cpp.h @@@ -323,32 -420,196 +422,229 @@@ void GEMM<float, lang::Cpp>(const bool cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr, lda, BPtr, ldb, beta, CPtr, ldc); } - #endif // USE_CBLAS + + #else template <> - void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, - Context *ctx) { + void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, + Context *ctx) { + size_t maxPos = 0; + float maxVal = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + if (i == 0) { + maxVal = inPtr[i]; + } else if (inPtr[i] > maxVal) { + maxVal = inPtr[i]; + maxPos = i; + } + } + *out = maxPos; + } + template <> + void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, + Context *ctx) { + size_t minPos = 0; + float minVal = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + if (i == 0) { + minVal = inPtr[i]; + } else if (inPtr[i] < minVal) { + minVal = inPtr[i]; + minPos = i; + } + } + *out = minPos; + } + + template <> + void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, + Context *ctx) { + float sum = 0; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + sum += fabs(inPtr[i]); + } + *out = sum; + } + + template <> + void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in, + Blob *out, Context *ctx) { float *outPtr = static_cast<float *>(out->mutable_data()); - for (size_t i = 0; i < num; i++) outPtr[i] = x; + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] += alpha * inPtr[i]; + } } + + template <> + void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, + Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data()); + for (size_t i = 0; i < num; i++) { + outPtr[i] *= x; + } + } + + template <> + void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2, + float *out, Context *ctx) { + float sum = 0; + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t i = 0; i < num; i++) { + sum += in1Ptr[i] * in2Ptr[i]; + } + *out = sum; + } + + template <> + void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, + const float alpha, const Blob *A, const Blob *v, + const float beta, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < m; r++) { + float sum = 0; + for (size_t c = 0; c < n; c++) { + size_t idx = trans ? c * m + r : r * n + c; + sum += APtr[idx] * vPtr[c]; + } + outPtr[r] = alpha * sum + beta * outPtr[r]; + } + } + + #endif // USE_CBLAS ++template <> ++void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize, ++ const size_t dim, const Blob *p, ++ const Blob *t, Blob *loss, ++ Context *ctx) { ++ const float *pPtr = static_cast<const float *>(p->data()); ++ const int *tPtr = static_cast<const int *>(t->data()); ++ float *lossPtr = static_cast<float *>(loss->mutable_data()); ++ for (size_t i = 0; i < batchsize; i++) { ++ int truth_idx = tPtr[i]; ++ CHECK_GE(truth_idx, 0); ++ float prob_of_truth = pPtr[i * dim + truth_idx]; ++ lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN)); ++ } ++} ++ +template <> +void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize, + const size_t dim, const Blob *p, + const Blob *t, + Blob *grad, Context *ctx) { + CHECK_EQ(p, grad) << "Use the same pointer to optimize performance"; + // const float* pPtr = static_cast<const float*>(p->data()); - const float *tPtr = static_cast<const float *>(t->data()); ++ const int *tPtr = static_cast<const int *>(t->data()); + float *gradPtr = static_cast<float *>(grad->mutable_data()); + + for (size_t i = 0; i < batchsize; i++) { + int truth_idx = static_cast<int>(tPtr[i]); + CHECK_GE(truth_idx, 0); + gradPtr[i * dim + truth_idx] -= 1.0; + } +} + + // =========Matrix operations ================================================ + /* + template <> + void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *A, const Blob *v, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[offset + c] = APtr[offset + c] + vPtr[r]; + } + } + } + + template <> + void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *A, const Blob *v, Blob *out, + Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *APtr = static_cast<const float *>(A->data()); + const float *vPtr = static_cast<const float *>(v->data()); + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[offset + c] = APtr[offset + c] + vPtr[c]; + } + } + } + template <> + void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, + const Blob *in2, Blob *out, Context *ctx) { + float *outPtr =
static_cast<float *>(out->mutable_data()); + const float *in1Ptr = static_cast<const float *>(in1->data()); + const float *in2Ptr = static_cast<const float *>(in2->data()); + for (size_t r = 0; r < m; r++) { + size_t offset = r * n; + for (size_t c = 0; c < n; c++) { + outPtr[offset + c] = in1Ptr[r] * in2Ptr[c]; + } + } + } + template <> + void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + float *bPtr = new float[ncol]; + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + float denom = 0.f; + for (size_t c = 0; c < ncol; c++) { + bPtr[c] = exp(inPtr[offset + c]); + denom += bPtr[c]; + } + for (size_t c = 0; c < ncol; c++) { + size_t idx = offset + c; + outPtr[idx] = bPtr[c] / denom; + } + } + delete bPtr; + } + + template <> + void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t c = 0; c < ncol; c++) { + outPtr[c] = 0.f; + } + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + for (size_t c = 0; c < ncol; c++) { + outPtr[c] += inPtr[offset + c]; + } + } + } + + template <> + void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, + const Blob *in, Blob *out, Context *ctx) { + float *outPtr = static_cast<float *>(out->mutable_data()); + const float *inPtr = static_cast<const float *>(in->data()); + for (size_t r = 0; r < nrow; r++) { + size_t offset = r * ncol; + outPtr[r] = 0.f; + for (size_t c = 0; c < ncol; c++) { + outPtr[r] += inPtr[offset + c]; + } + } + } + */ } // namespace singa #endif // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/core/tensor/tensor_math_cuda.h ---------------------------------------------------------------------- diff --cc src/core/tensor/tensor_math_cuda.h index c69620c,e2597d5..9a8839e --- a/src/core/tensor/tensor_math_cuda.h +++ b/src/core/tensor/tensor_math_cuda.h @@@ -131,55 -398,6 +398,28 @@@ void GEMM<float, lang::Cuda>(const boo BPtr, ldb, APtr, lda, &beta, CPtr, ldc)); } +template <> - void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::GE(num, inPtr, x, outPtr, ctx->stream); - } - template <> - void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::GT(num, inPtr, x, outPtr, ctx->stream); - } - template <> - void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::LE(num, inPtr, x, outPtr, ctx->stream); - } - template <> - void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x, - Blob* out, Context *ctx) { - float* outPtr = static_cast<float*>(out->mutable_data()); - const float* inPtr = static_cast<const float*>(in->data()); - cuda::LT(num, inPtr, x, outPtr, ctx->stream); - } - - template<> - void Set<float, 
++template <>
++void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
++                                            const size_t dim, const Blob *p,
++                                            const Blob *t, Blob *loss,
++                                            Context *ctx) {
++  const float *pPtr = static_cast<const float *>(p->data());
++  const int *tPtr = static_cast<const int *>(t->data());
++  float *lossPtr = static_cast<float *>(loss->mutable_data());
++  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
+}
-
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
+                                               const size_t dim, const Blob *p,
+                                               const Blob *t, Blob *grad,
+                                               Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  const float *pPtr = static_cast<const float *>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
+                               ctx->stream);
+}
-
 }  // namespace singa

#endif  // USE_CUDA
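Both wrappers forward to kernels declared in src/core/tensor/math_kernel.h and implemented in src/core/tensor/math_kernel.cu, which this commit also updates. Purely as an illustrative sketch of that kernel's job, and not the committed implementation (the kernel name and launch geometry here are assumptions), a one-thread-per-sample forward kernel could look like:

    #include <cfloat>  // FLT_MIN

    // Hypothetical sketch; see math_kernel.cu for the real kernel. Each
    // thread handles one sample: it reads the probability assigned to the
    // ground-truth class (an int index, matching this commit's bug fix)
    // and writes -log(max(p, FLT_MIN)).
    __global__ void KernelCrossEntropy(const size_t batchsize, const size_t dim,
                                       const float* p, const int* t,
                                       float* loss) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < batchsize) {
        const int truth_idx = t[i];
        const float prob = fmaxf(p[i * dim + truth_idx], FLT_MIN);
        loss[i] = -logf(prob);
      }
    }

A host wrapper in the spirit of cuda::ComputeCrossEntropy would then launch it on ctx->stream with roughly ceil(batchsize / 256.0) blocks of 256 threads.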
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --cc src/model/layer/softmax.cc
index 8af1d76,b379fc1..f554f25
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@@ -25,14 -25,16 +25,17 @@@ void Softmax::Setup(const LayerConf& co
 }

 const Tensor Softmax::Forward(int flag, const Tensor& input) {
+  Tensor output;
   if (input.nDim() == 1) {
-    buf_.push(SoftMax(input));
-    Tensor tmp = Reshape(input, Shape{1, input.Size()});
-    output = SoftMax(tmp, 0);
++    output = SoftMax(input);
   } else {
-    output = SoftMax(input, axis_);
+    size_t nrow = Product(input.shape(), 0, axis_);
+    const Tensor& tmp = Reshape(input, Shape{nrow, input.Size() / nrow});
-    buf_.push(SoftMax(tmp));
++    output = SoftMax(tmp);
   }
-  return buf_.top();
+  if (flag & kTrain)  // buffer the output only during training
+    buf_.push(output);
+  return output;
 }

 const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --cc src/model/loss/softmax_cross_entropy.cc
index 4ca323a,0000000..bed3348
mode 100644,000000..100644
--- a/src/model/loss/softmax_cross_entropy.cc
+++ b/src/model/loss/softmax_cross_entropy.cc
@@@ -1,53 -1,0 +1,53 @@@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
-
-Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction, const Tensor& target) {
++Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
++                                    const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward again before calling Backward;"
+                      << " the calling pattern is [Forward|Evaluate] Backward";
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
+  Tensor prob = SoftMax(input);
+
+  // buffer intermediate data for Backward()
+  buf_.push(prob);
+  buf_.push(target);
-  Tensor loss = prob.Clone();
++  Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
+
-  ComputeCrossEntropy(target, &loss);
++  ComputeCrossEntropy(prob, target, &loss);
+  return loss;
+}
+
+Tensor SoftmaxCrossEntropy::Backward() {
+  const Tensor target = buf_.top();
+  buf_.pop();
+  Tensor prob = buf_.top();
+  buf_.pop();
+  SoftmaxCrossEntropyBwd(target, &prob);
+  return prob;
+}
+}  // namespace singa
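The expected values in test_cross_entropy.cc below follow directly from the formulas above. All eight prediction scores are equal (0.1), so over dim = 4 classes the softmax output is uniform:

    p_{ij} = \tfrac{1}{4}, \qquad
    \ell_i = -\log\tfrac{1}{4} = \log 4 \approx 1.3863, \qquad
    \frac{\partial \ell_i}{\partial z_{ij}} =
      \begin{cases} \tfrac{1}{4} - 1 = -\tfrac{3}{4}, & j = t_i \\ \tfrac{1}{4}, & \text{otherwise} \end{cases}

which is exactly the -log(0.25), -0.75 and 0.25 that the assertions check for both the CPP and CUDA paths.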
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 6b8cb69,0000000..0eb36e5
mode 100644,000000..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,114 -1,0 +1,116 @@@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "singa/model/loss.h"
+#include "singa_config.h"
+
+using singa::Tensor;
+class TestSoftmaxCrossEntropy : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 4});
+    t.Reshape(singa::Shape{2, 1});
+  }
+  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
-  const float tdat[2] = {0.0, 2.0};
++  const int tdat[2] = {0, 2};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
+  const Tensor& loss = cross_entropy.Forward(p, t);
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+  p.CopyDataFromHostPtr(pdat, 8);
++  t.AsType(singa::kInt);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
+  cross_entropy.Forward(p, t);
+  const Tensor& grad = cross_entropy.Backward();
+
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+
+#ifdef USE_CUDA
+
+TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  Tensor loss = cross_entropy.Forward(p, t);
+  loss.ToHost();
+  auto ldat = loss.data<const float*>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  singa::CudaGPU dev;
+  p.ToDevice(&dev);
+  t.ToDevice(&dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  cross_entropy.Forward(p, t);
+  Tensor grad = cross_entropy.Backward();
+
+  grad.ToHost();
+  auto gdat = grad.data<const float*>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+#endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_mse.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_softmax.cc
index 09dfcd9,c087605..fff8510
--- a/test/singa/test_softmax.cc
+++ b/test/singa/test_softmax.cc
@@@ -36,13 -36,14 +36,14 @@@ TEST(Softmax, Setup)
   EXPECT_EQ(2, sft.Axis());
 }

+ #ifdef USE_CBLAS
 TEST(Softmax, Forward) {
   const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
--  size_t n = sizeof(x) / sizeof(float);
   size_t row = 2;
   size_t col = 3;
++  size_t n = row * col;
   singa::Tensor in(singa::Shape{row, col});
--  in.CopyDataFromHostPtr<float>(x, n);
++  in.CopyDataFromHostPtr<float>(x, row * col);
   int axis = 1;
   Softmax sft;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/21e4b2d7/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 8368c55,38a9291..1092d69
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -20,6 -22,263 +22,261 @@@ class TestTensorMath : public ::testing
   const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };

+TEST_F(TestTensorMath, MemberAbs) {
+  Tensor aa = a.Clone();
+  Tensor bb = b.Clone();
+  Tensor cc = aa - bb;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+  Tensor p = Abs(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+  Tensor p = Exp(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+  Tensor p = Log(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = ReLU(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+  Tensor p = Sigmoid(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<const float *>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = Sign(cc);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_EQ(0.0f, dptr1[0]);
+  EXPECT_EQ(0.0f, dptr1[1]);
+  EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+  Tensor p = Sqrt(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+  Tensor p = Square(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+  EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+  Tensor p = Tanh(a);
+  const float *dptr1 = p.data<const float *>();
+  EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+  Tensor p1 = Sum(e, 0);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+
+  Tensor p2(Shape{3, 1});
+  p2 = Sum(e, 1);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+  EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+  EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
-  Tensor p1(Shape{3, 2});
-  p1 = SoftMax(e, 0);
++  Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 6; i++) sum += exp(i + 1);
+  EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+  EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+  EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
-  Tensor p2(Shape{3, 2});
-  p2 = SoftMax(e, 1);
++  Tensor p2 = SoftMax(e);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+  EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+  Tensor p1 = a < 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+  Tensor p1 = a <= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+  Tensor p1 = a > 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+  Tensor p1 = a >= 2.0f;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+  Tensor p1 = Pow(b, 3.0f);
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+  EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+  EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+
+  // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+  // function is complete
+  // Tensor p2 = Pow(a, b);
+  // const float *dptr2 = p2.data<const float *>();
+  // EXPECT_FLOAT_EQ(pow(1.0f, 1.1f), dptr2[0]);
+  // EXPECT_FLOAT_EQ(pow(2.0f, 2.1f), dptr2[1]);
+  // EXPECT_FLOAT_EQ(pow(3.0f, 3.1f), dptr2[2]);
+}
+
+TEST_F(TestTensorMath, MemberSub) {
+  Tensor p1 = a - b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+  Tensor p1 = a * b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+  Tensor p1 = a / b;
+  const float *dptr1 = p1.data<const float *>();
+  EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+
+  Tensor p2 = Div(10.0f, b);
+  const float *dptr2 = p2.data<const float *>();
+  EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+
+  Tensor p3 = a / 8.0f;
+  const float *dptr3 = p3.data<const float *>();
+  EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+}
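The three sampling tests below validate the Bernoulli, Uniform and Gaussian fills statistically, comparing the sample mean and the unbiased sample variance (hence the n - 1 divisors 9999 and 49999) against the closed-form moments:

    \mathrm{Bernoulli}(p):\ \mu = p = 0.3,\quad \sigma^2 = p(1-p) = 0.3 \cdot 0.7
    \mathrm{Uniform}(a,b):\ \mu = \tfrac{a+b}{2} = 0.15,\quad \sigma^2 = \tfrac{(b-a)^2}{12} = \tfrac{0.01}{12}
    \mathcal{N}(\mu,\sigma^2):\ \mu = 0,\quad \sigma^2 = 1

These are exactly the targets passed to EXPECT_NEAR in each test.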
+TEST_F(TestTensorMath, MemberBernoulli) {
+  Tensor p1(Shape{10000});
+  Bernoulli(0.3f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.3f, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+  Tensor p1(Shape{10000});
+  Uniform(0.1f, 0.2f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+  Tensor p1(Shape{50000});
+  Gaussian(0.0f, 1.0f, &p1);
+  const float *dptr1 = p1.data<const float *>();
+  float sum = 0;
+  for (int i = 0; i < 50000; i++) sum += dptr1[i];
+  float mean = sum / 50000;
+  EXPECT_NEAR(mean, 0.0, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 49999;
+  EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
 TEST_F(TestTensorMath, MemberAddTensor) {
   Tensor aa = a.Clone();
   aa += a;
