SINGA-80 New Blob Level and Address Level Math Operation Interface
---
addr level cpu

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/06cdf366
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/06cdf366
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/06cdf366

Branch: refs/heads/master
Commit: 06cdf366eb2f87b8b22a728835e2aae3d3e40661
Parents: e137784
Author: jinyangturbo <pku.tu...@gmail.com>
Authored: Wed Oct 14 07:44:49 2015 -0700
Committer: Wei Wang <wang...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 include/singa/blob/math_addr.cc |  33 +++++++
 include/singa/blob/math_addr.h  |  90 +++++++++++++++
 include/singa/blob/math_blob.h  | 139 +++++++++++++++++++++++++++++
 include/singa/blob/singa_op.h   | 150 +++++++++++++++++++++++++++++++
 include/singa/blob/test.cc      | 165 +++++++++++++++++++++++++++++++++++
 5 files changed, 577 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.cc b/include/singa/blob/math_addr.cc
new file mode 100644
index 0000000..a03ce60
--- /dev/null
+++ b/include/singa/blob/math_addr.cc
@@ -0,0 +1,33 @@
+extern "C"
+{
+    #include <cblas.h>
+}
+
+#include "math_addr.h"
+#include "singa_op.h"
+
+// C = alpha * op(A) * op(B) + beta * C, all row-major;
+// op(A) is m*k, op(B) is k*n, C is m*n
+void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C)
+{
+    int lda, ldb;
+    CBLAS_TRANSPOSE tA, tB;
+    lda = TranA ? m : k;
+    ldb = TranB ? k : n;
+    tA = TranA ? CblasTrans : CblasNoTrans;
+    tB = TranB ? CblasTrans : CblasNoTrans;
+    cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, B, ldb, beta, C, n);
+}
+
+// C = alpha * op(A) * B + beta * C, where A is stored as an m*n row-major matrix
+void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C)
+{
+    CBLAS_TRANSPOSE tA;
+    tA = TranA ? CblasTrans : CblasNoTrans;
+    cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1);
+}
+
+// B = alpha * A + B over n contiguous elements
+void cpu_axpy(const float * A, const int n, const float alpha, float * B)
+{
+    cblas_saxpy(n, alpha, A, 1, B, 1);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
new file mode 100644
index 0000000..c732060
--- /dev/null
+++ b/include/singa/blob/math_addr.h
@@ -0,0 +1,90 @@
+#ifndef MATH_ADDR_H
+#define MATH_ADDR_H
+
+void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C);
+
+void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C);
+// Be careful with the dimensions: A is stored as an m*n row-major matrix.
+// With TranA == true, C = alpha*A^T*B + beta*C, so B has length m and C has length n;
+// with TranA == false, C = alpha*A*B + beta*C, so B has length n and C has length m.
+
+void cpu_axpy(const float * A, const int n, const float alpha, float * B);
+
+/*
+// element-wise
+template<typename Op> void cpu_e_f(const int n, const float alpha, float * A);
+template<typename Op> void cpu_e_f(const int n, const float * A, const float alpha, float * B);
+template<typename Op> void cpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C);
+// element-wise generalized operation defined in Op
+*/
+
+// element-wise generalized operations defined in Op
+template<typename Op> void cpu_e_f(const int n, const float alpha, float * A)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, A[i]);
+    }
+}
+
+template<typename Op> void cpu_e_f(const int n, const float * A, const float alpha, float * B)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, A[i], B[i]);
+    }
+}
+
+template<typename Op> void cpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, beta, A[i], B[i], C[i]);
+    }
+}
+
+/*
+// matrix/vector expand/reduce
+template<typename Op> void cpu_reduce_f(const float * A, const int m, const int n, float * B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void cpu_expand_f(const float * A, const int m, const int n, float * B);
+// expand each element in A into a row of B
+*/
+
+// matrix/vector expand/reduce
+
+// reduce each row of the m*n matrix A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void cpu_reduce_f(const float * A, const int m, const int n, float * B)
+{
+    for(int i = 0; i < m; i++)
+    {
+        Op::Map(A + i*n, n, B[i]);
+    }
+}
+
+// expand each element in A into a row of the m*n matrix B
+template<typename Op> void cpu_expand_f(const float * A, const int m, const int n, float * B)
+{
+    for(int i = 0; i < m; i++)
+    {
+        Op::Map(A[i], n, B + i*n);
+    }
+}
+
+void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C);
+void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C);
+void gpu_axpy(const float * A, const int n, const float alpha, float * B);
+
+// element-wise generalized operations defined in Op
+template<typename Op> void gpu_e_f(const int n, const float alpha, float * A);
+template<typename Op> void gpu_e_f(const int n, const float * A, const float alpha, const float beta, float * B);
+template<typename Op> void gpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C);
+
+// matrix/vector expand/reduce
+template<typename Op> void gpu_reduce_f(const float * A, const int m, const int n, float * B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void gpu_expand_f(const float * A, const int m, const int n, float * B);
+// expand each element in A into a row of B
+
+#endif  // MATH_ADDR_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h
new file mode 100644
index 0000000..e26e4d4
--- /dev/null
+++ b/include/singa/blob/math_blob.h
@@ -0,0 +1,139 @@
+/*********************Level-2 interface, called by user code*******************/
+// C++ usually uses const & for input arguments and * for output arguments.
+// ww: maybe we represent Blob's shape using int s[4]+dim? Currently we use a vector,
+// which may not be as convenient as an int array.
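To make the split between the two levels concrete, here is a minimal sketch of how a Level-2 call such as MMDot (declared below) could bottom out in the address-level cpu_gemm above. The Blob accessors shape(), cpu_data() and mutable_cpu_data() are assumptions for illustration; they are not defined in this commit.

// Sketch only: assumes Blob exposes shape(), cpu_data(), mutable_cpu_data().
void MMDot(const Blob & A, const Blob & B, Blob & C)
{
    const int m = A.shape()[0];   // rows of A and of C
    const int k = A.shape()[1];   // cols of A == rows of B
    const int n = B.shape()[1];   // cols of B and of C
    // C = 1*A*B + 0*C, no transposition of either input
    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, 1.0f, 0.0f,
             false, false, C.mutable_cpu_data());
}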
+
+/**********************************************************************************/
+// class1 matrix operation
+
+void MMDot(const Blob & A, const Blob & B, Blob & C);
+// A, B and C are matrices
+
+void MVDot(const Blob & A, const Blob & B, Blob & C);
+// A is a matrix, B and C are vectors
+
+void VVDot(const Blob & A, const Blob & B, Blob & C);
+// outer product: C is a matrix, A and B are vectors
+
+float VVdot(const Blob & A, const Blob & B);
+// inner product: A and B are vectors
+
+void GEMM(const Blob & A, const Blob & B, Blob & C, float alpha = 1, float beta = 1);
+// C = alpha*A*B + beta*C, where A, B and C are matrices
+
+Blob Reshape(const Blob & A, const std::vector<int>& shape);
+// the current reshape in blob.h is: void Reshape(const std::vector<int>& shape);
+// returns a reference to the reshaped blob
+
+Blob Transpose(const Blob & A);
+// returns A^T, only a reference to the blob A
+// ww: just add a bool field in Blob, e.g., transpose_
+
+/**********************************************************************************/
+// class2 element-wise operation
+
+void Set(Blob & A, float alpha);
+// element-wise operation: Ai = alpha
+
+void AXPY(const Blob & A, Blob & B, float alpha);
+// element-wise operation: Bi = alpha*Ai + Bi; A and B should have the same size
+
+void Add(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai + Bi; A, B and C should have the same size
+
+void Sub(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai - Bi; A, B and C should have the same size
+
+void Mult(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai * Bi; A, B and C should have the same size
+
+void Div(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai / Bi; A, B and C should have the same size
+
+void Scale(const Blob & A, Blob & B, float alpha);
+// element-wise operation: Bi = alpha*Ai
+
+void Sigmoid(const Blob & A, Blob & B, float t);
+// element-wise operation: Bi = 1/(1 + e^(-Ai*t))
+
+void Relu(const Blob & A, Blob & B, float t = 0);
+// element-wise (leaky) relu: Bi = ((1-t)*abs(Ai) + (1+t)*Ai)/2
+
+void Tanh(const Blob & A, Blob & B, float t);
+// element-wise operation: Bi = tanh(Ai*t)
+
+void Exp(const Blob & A, Blob & B, float alpha = 2.71);
+// element-wise operation: Bi = alpha^Ai (the default alpha approximates e)
+// ww: there are many element-wise operations, e.g., log, square, sqrt.
+// If MKL/OpenBLAS or other libraries do not optimize these operations in an ad-hoc manner,
+// then we can implement them using the E_Func below, passing the basic log/square/sqrt operations.
+
+template<typename Op> void E_Func(Blob & A, float alpha);
+template<typename Op> void E_Func(const Blob & A, Blob & B, float alpha, float beta);
+template<typename Op> void E_Func(const Blob & A, const Blob & B, Blob & C, float alpha, float beta);
+// element-wise generalized operation defined in Op
+
+// ww: the following functions may require thread-specific variables, e.g., a seed or random stream state.
+
+void Gaussian(Blob & A, float mu, float sigma);
+// element-wise operation: initialize each element in A following the Gaussian(mu, sigma) distribution
+
+void Uniform(Blob & A, float low, float high);
+// element-wise operation: initialize each element in A following the uniform distribution on [low, high]
+
+void Bernoulli(Blob & A, float p, int n = 1);
+// element-wise operation: initialize each element in A following the Bernoulli(n, p) distribution
+
+/**********************************************************************************/
+// class3 matrix-vector expand/reduce operation
+
+template<typename Op> void Reduce_F(const Blob & A, Blob & B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void Expand_F(const Blob & A, Blob & B);
+// expand each element in A into a row of B
+
+void Repmat(const Blob & A, Blob & B);
+// A is a vector, B is a matrix; set each row of B to A
+// just copies memory, so it is faster than the general expand_f
+
+// ww: may rename to MVAdd, MVSum to be consistent with MVDot, MMDot, VVDot.
+void MVAdd(const Blob & A, Blob & B, float alpha, float beta);
+// A is a vector, B is a matrix; Bij = alpha*Ai + beta*Bij
+// will use gemm, faster than the general expand_f
+
+void MVSum(const Blob & A, Blob & B, float alpha, float beta);
+// A is a vector, B is a matrix; Ai = alpha * \sum_j Bij + beta*Ai
+// will use gemm, faster than the general reduce_f
+
+void Softmax(const Blob & A, Blob & B, float alpha);
+// row-wise softmax: Bij = e^(alpha*Aij) / \sum_j e^(alpha*Aij)
+
+/**********************************************************************************/
+// class4 convolution operation
+
+void Conv(const Blob & A, const Blob & B, Blob & C);
+// A is the data, B is the parameter, C is the result
+
+void Pool(const Blob & A, Blob & B, int method);
+// A is the data, B is the result; method should indicate max or average pooling
+
+// jy: do we need to provide the corresponding gradient-computation functions?
+
+// ww: The conv/pool operations cannot be declared as above,
+// because they require other parameters, e.g., filter size, number of filters, pad, stride, etc.
+// Caffe and mxnet use cudnn in their layer implementations instead of implementing low-level operations.
+// Maybe we can follow caffe? The layer implementation is not our contribution, so we can use others' code directly.
+// For the CPU version, we use im2col and col2im for the conv layer.
+// For the GPU version, we use cudnn for the conv layer, and similarly for the other layers.
+// In conclusion, we may not implement Blob-level Conv and Pool operations.
+// Instead, we implement CaffeConvLayer, CaffePoolLayer, cudnnConvLayer, cudnnPoolLayer.
+// Later we may add IntelConvLayer (cpu), NeonConvLayer (gpu).
+
+Blob setcolspace(const Blob & A);
+void im2col(const Blob & A, Blob & B);
+void col2im(const Blob & A, Blob & B);
+// given an img, use setcolspace to generate the colspace Blob;
+// use pack/unpack (im2col/col2im) to get the data in col/img form

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h
new file mode 100644
index 0000000..7ca06db
--- /dev/null
+++ b/include/singa/blob/singa_op.h
@@ -0,0 +1,150 @@
+#ifndef SINGA_OP_H
+#define SINGA_OP_H
+
+#include <cmath>
+#include <algorithm>
+
+// Basic scalar operations plugged into cpu_e_f/cpu_reduce_f/cpu_expand_f.
+// Convention: in the *_grad ops, a is the forward output of the op.
+namespace op {
+    struct Set {
+        inline static void Map(float alpha, float & a) {
+            a = alpha;
+        }
+    };
+
+    struct Scale {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * a;
+        }
+    };
+    struct Scale_grad {
+        inline static void Map(float alpha, float & output) {
+            output = alpha;
+        }
+    };
+
+    struct Exp {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = pow(alpha, a);  // b = alpha^a, matching Exp in math_blob.h
+        }
+    };
+    struct Exp_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a * log(alpha);
+            // log is the natural log with base e
+        }
+    };
+
+    struct Gsigmoid {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 1.0f / (1.0f + expf(-a * alpha));
+        }
+    };
+    struct Gsigmoid_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * a * (1.0f - a);
+        }
+    };
+
+    struct Grelu {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = (1 - alpha) * std::max(a, 0.0f) + alpha * a;
+        }
+    };
+    struct Grelu_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a > 0.0f ? 1.0f : alpha;
+        }
+    };
+
+    struct Gtanh {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = tanhf(a * alpha);
+        }
+    };
+    struct Gtanh_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * (1.0f - a * a);
+        }
+    };
+
+    struct Softplus {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = logf(1 + expf(a));
+        }
+    };
+    struct Softplus_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 1.0f / (1.0f + expf(-a));
+        }
+    };
+
+    struct Square {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a * a;
+        }
+    };
+    struct Square_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 2 * sqrt(a);  // a is the forward output (a = x^2); note this loses the sign of x for x < 0
+        }
+    };
+
+    struct Sqrt {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = sqrt(a);
+        }
+    };
+
+    struct Threshold {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a < b ? 1.0f : 0.0f;
+        }
+    };
+
+    struct Add {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a + b;
+        }
+    };
+
+    struct Sub {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a - b;
+        }
+    };
+
+    struct Mult {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a * b;
+        }
+    };
+
+    struct Div {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a / b;
+        }
+    };
+
+    struct Sum {
+        inline static void Map(const float * a, int n, float & b) {
+            b = 0;
+            for(int i = 0; i < n; i++)
+            {
+                b += a[i];
+            }
+        }
+    };
+
+    struct Repmat {
+        inline static void Map(const float & a, int n, float * b) {
+            for(int i = 0; i < n; i++)
+            {
+                b[i] = a;
+            }
+        }
+    };
+
+}  // namespace op
+
+#endif  // SINGA_OP_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/test.cc
----------------------------------------------------------------------
diff --git a/include/singa/blob/test.cc b/include/singa/blob/test.cc
new file mode 100644
index 0000000..d13ed5e
--- /dev/null
+++ b/include/singa/blob/test.cc
@@ -0,0 +1,165 @@
+#include <iostream>
+
+#include "singa_op.h"
+#include "math_addr.h"
+
+using namespace std;
+
+// Every test prints the difference between the library result and a naive
+// reference computation, so a correct run prints only zeros.
+void test_gemm1()
+{
+    float A[3][2] = {};
+    float B[3][2] = {};
+    float C[2][2] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            A[i][j] = i + j;
+            B[i][j] = i + j - i*j;
+        }
+    cpu_gemm(A[0], B[0], 2, 2, 3, 1, 0, true, false, C[0]);  // C = A^T * B
+    float D[2][2] = {};
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            D[i][j] = 0;
+            for(int k = 0; k < 3; k++)
+                D[i][j] += A[k][i] * B[k][j];
+        }
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            cout << C[i][j] - D[i][j] << endl;
+        }
+}
+
+void test_gemm2()
+{
+    float A[2][3] = {};
+    float B[3][2] = {};
+    float C[2][2] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            A[j][i] = i - j;
+            B[i][j] = i + j + i*j;
+        }
+    cpu_gemm(A[0], B[0], 2, 2, 3, 1, 0, false, false, C[0]);  // C = A * B
+    float D[2][2] = {};
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            D[i][j] = 0;
+            for(int k = 0; k < 3; k++)
+                D[i][j] += A[i][k] * B[k][j];
+        }
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            cout << C[i][j] - D[i][j] << endl;
+        }
+}
+
+void test_gemv()
+{
+    float A[4][3] = {};
+    float B[4] = {};
+    float C[3] = {};
+    float D[3] = {};
+    for(int i = 0; i < 3; i++)      // columns of the 4*3 matrix A
+    {
+        for(int j = 0; j < 4; j++)  // rows
+        {
+            A[j][i] = i - j + i*j;
+        }
+    }
+    for(int i = 0; i < 4; i++) B[i] = i;
+    for(int i = 0; i < 3; i++) C[i] = 10;
+    cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);  // C = A^T * B + C
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 4; j++)
+        {
+            D[i] += A[j][i] * B[j];
+        }
+    for(int i = 0; i < 3; i++) cout << C[i] - D[i] - 10 << endl;
+}
+
+void test_axpy()
+{
+    float A[4][3] = {};
+    float C[4][3] = {};
+    float B[3][4] = {};
+    float D[3][4] = {};
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 3; j++)
+        {
+            A[i][j] = i - j + i*j;
+            B[j][i] = i - j + i*j;
+            C[i][j] = A[i][j];
+            D[j][i] = B[j][i];
+        }
+    }
+    cpu_axpy(A[0], 12, 2, B[0]);  // B += 2*A over all 12 elements
+    for(int i = 0; i < 12; i++) D[0][i] += 2 * C[0][i];
+    for(int i = 0; i < 3; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            cout << B[i][j] - D[i][j] << endl;
+        }
+    }
+}
+
+void test_eop()
+{
+    float A[10] = {};
+    float B[10] = {};
+    float C[10] = {};
+    float D[10] = {};
+    float O[10] = {};
+    for(int i = 0; i < 10; i++)
+    {
+        A[i] = i;
+        B[i] = -i;
+        C[i] = i;
+    }
+    cpu_e_f<op::Set>(5, 15, O);  // set the first 5 elements of O to 15
+    for(int i = 0; i < 5; i++) cout << O[i] - 15 << endl;
+    for(int i = 5; i < 10; i++) cout << O[i] << endl;
+    cpu_e_f<op::Scale>(10, C, 2, C);  // in place: C = 2*C
+    for(int i = 0; i < 10; i++) cout << C[i] - 2*i << endl;
+    cpu_e_f<op::Add>(10, A, B, 0, 0, O);  // O = A + B = 0
+    for(int i = 0; i < 10; i++) cout << O[i] << endl;
+}
+
+void test_exrd()
+{
+    float A[3][10] = {};
+    float B[3] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 10; j++)
+        {
+            A[i][j] = (i + 1) * j;
+        }
+    cpu_reduce_f<op::Sum>(A[0], 3, 10, B);  // B[i] = sum of row i = 45*(i+1)
+    for(int i = 0; i < 3; i++) B[i] -= 45 * (i + 1);
+    for(int i = 0; i < 3; i++) cout << B[i] << endl;
+    cpu_expand_f<op::Repmat>(B, 3, 10, A[0]);  // expand the (now zero) B back into A
+    cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
+    for(int i = 0; i < 3; i++) cout << B[i] << endl;
+}
+
+int main()
+{
+    test_gemm1();
+    test_gemm2();
+    test_gemv();
+    test_axpy();
+    test_eop();
+    test_exrd();
+    return 0;
+}
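No build glue ships with this commit, so a note on running the tests: assuming a CBLAS implementation is installed (OpenBLAS is one option; the header location and linker flag are platform-dependent assumptions), something like "g++ -o blob_math_test test.cc math_addr.cc -lopenblas" from include/singa/blob should build the driver. Since every test emits library-minus-reference differences, a correct run prints only zeros.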