SINGA-80 New Blob Level and Address Level Math Operation Interface
---
addr level cpu

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/06cdf366
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/06cdf366
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/06cdf366

Branch: refs/heads/master
Commit: 06cdf366eb2f87b8b22a728835e2aae3d3e40661
Parents: e137784
Author: jinyangturbo <pku.tu...@gmail.com>
Authored: Wed Oct 14 07:44:49 2015 -0700
Committer: Wei Wang <wang...@comp.nus.edu.sg>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 include/singa/blob/math_addr.cc |  33 +++++++
 include/singa/blob/math_addr.h  |  90 +++++++++++++++
 include/singa/blob/math_blob.h  | 139 +++++++++++++++++++++++++++++
 include/singa/blob/singa_op.h   | 150 +++++++++++++++++++++++++++++++
 include/singa/blob/test.cc      | 165 +++++++++++++++++++++++++++++++++++
 5 files changed, 577 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.cc b/include/singa/blob/math_addr.cc
new file mode 100644
index 0000000..a03ce60
--- /dev/null
+++ b/include/singa/blob/math_addr.cc
@@ -0,0 +1,33 @@
+extern "C"
+{
+    #include <cblas.h>
+}
+
+#include "math_addr.h"
+#include "singa_op.h"
+
+// C = alpha * op(A) * op(B) + beta * C, all row-major;
+// op(A) is m*k, op(B) is k*n, C is m*n
+void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C)
+{
+    int lda, ldb;
+    CBLAS_TRANSPOSE tA, tB;
+    lda = TranA ? m : k;
+    ldb = TranB ? k : n;
+    tA = TranA ? CblasTrans : CblasNoTrans;
+    tB = TranB ? CblasTrans : CblasNoTrans;
+    cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda, B, ldb, beta, C, n);
+}
+
+// C = alpha * op(A) * B + beta * C, where A is stored as an m*n row-major matrix
+void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C)
+{
+    CBLAS_TRANSPOSE tA;
+    tA = TranA ? CblasTrans : CblasNoTrans;
+    cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1);
+}
+
+// B = alpha * A + B over n contiguous elements
+void cpu_axpy(const float * A, const int n, const float alpha, float * B)
+{
+    cblas_saxpy(n, alpha, A, 1, B, 1);
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_addr.h b/include/singa/blob/math_addr.h
new file mode 100644
index 0000000..c732060
--- /dev/null
+++ b/include/singa/blob/math_addr.h
@@ -0,0 +1,90 @@
+#ifndef MATH_ADDR_H
+#define MATH_ADDR_H
+
+void cpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C);
+
+void cpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C);
+// Be careful with the dimensions: A is stored as an m*n row-major matrix.
+// With TranA == true, C = alpha*A^T*B + beta*C, so B has length m and C has length n;
+// with TranA == false, C = alpha*A*B + beta*C, so B has length n and C has length m.
+
+void cpu_axpy(const float * A, const int n, const float alpha, float * B);
+
+/*
+// element-wise
+template<typename Op> void cpu_e_f(const int n, const float alpha, float * A);
+template<typename Op> void cpu_e_f(const int n, const float * A, const float alpha, float * B);
+template<typename Op> void cpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C);
+// element-wise generalized operation defined in Op
+*/
+
+// element-wise generalized operations defined in Op
+template<typename Op> void cpu_e_f(const int n, const float alpha, float * A)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, A[i]);
+    }
+}
+
+template<typename Op> void cpu_e_f(const int n, const float * A, const float alpha, float * B)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, A[i], B[i]);
+    }
+}
+
+template<typename Op> void cpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C)
+{
+    for(int i = 0; i < n; i++)
+    {
+        Op::Map(alpha, beta, A[i], B[i], C[i]);
+    }
+}
+
+/*
+// matrix/vector expand/reduce
+template<typename Op> void cpu_reduce_f(const float * A, const int m, const int n, float * B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void cpu_expand_f(const float * A, const int m, const int n, float * B);
+// expand each element in A into a row of B
+*/
+
+// matrix/vector expand/reduce
+
+// reduce each row of the m*n matrix A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void cpu_reduce_f(const float * A, const int m, const int n, float * B)
+{
+    for(int i = 0; i < m; i++)
+    {
+        Op::Map(A + i*n, n, B[i]);
+    }
+}
+
+// expand each element in A into a row of the m*n matrix B
+template<typename Op> void cpu_expand_f(const float * A, const int m, const int n, float * B)
+{
+    for(int i = 0; i < m; i++)
+    {
+        Op::Map(A[i], n, B + i*n);
+    }
+}
+
+void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C);
+void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C);
+void gpu_axpy(const float * A, const int n, const float alpha, float * B);
+
+// element-wise generalized operations defined in Op
+template<typename Op> void gpu_e_f(const int n, const float alpha, float * A);
+template<typename Op> void gpu_e_f(const int n, const float * A, const float alpha, const float beta, float * B);
+template<typename Op> void gpu_e_f(const int n, const float * A, const float * B, const float alpha, const float beta, float * C);
+
+// matrix/vector expand/reduce
+template<typename Op> void gpu_reduce_f(const float * A, const int m, const int n, float * B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void gpu_expand_f(const float * A, const int m, const int n, float * B);
+// expand each element in A into a row of B
+
+#endif  // MATH_ADDR_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/math_blob.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_blob.h b/include/singa/blob/math_blob.h
new file mode 100644
index 0000000..e26e4d4
--- /dev/null
+++ b/include/singa/blob/math_blob.h
@@ -0,0 +1,139 @@
+/*********************Level-2 interface, called by user code*******************/
+// C++ usually uses const & for input arguments and * for output arguments.
+// ww: maybe we represent Blob's shape using int s[4]+dim? Currently we use a vector,
+// which may not be as convenient as an int array.
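To make the split between the two levels concrete, here is a minimal sketch of how a Level-2 call such as MMDot (declared below) could bottom out in the address-level cpu_gemm above. The Blob accessors shape(), cpu_data() and mutable_cpu_data() are assumptions for illustration; they are not defined in this commit.

// Sketch only: assumes Blob exposes shape(), cpu_data(), mutable_cpu_data().
void MMDot(const Blob & A, const Blob & B, Blob & C)
{
    const int m = A.shape()[0];   // rows of A and of C
    const int k = A.shape()[1];   // cols of A == rows of B
    const int n = B.shape()[1];   // cols of B and of C
    // C = 1*A*B + 0*C, no transposition of either input
    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, 1.0f, 0.0f,
             false, false, C.mutable_cpu_data());
}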
+
+/**********************************************************************************/
+// class1 matrix operation
+
+void MMDot(const Blob & A, const Blob & B, Blob & C);
+// A, B and C are matrices
+
+void MVDot(const Blob & A, const Blob & B, Blob & C);
+// A is a matrix, B and C are vectors
+
+void VVDot(const Blob & A, const Blob & B, Blob & C);
+// outer product: C is a matrix, A and B are vectors
+
+float VVdot(const Blob & A, const Blob & B);
+// inner product: A and B are vectors
+
+void GEMM(const Blob & A, const Blob & B, Blob & C, float alpha = 1, float beta = 1);
+// C = alpha*A*B + beta*C, where A, B and C are matrices
+
+Blob Reshape(const Blob & A, const std::vector<int>& shape);
+// the current reshape in blob.h is: void Reshape(const std::vector<int>& shape);
+// returns a reference to the reshaped blob
+
+Blob Transpose(const Blob & A);
+// returns A^T, only a reference to the blob A
+// ww: just add a bool field in Blob, e.g., transpose_
+
+/**********************************************************************************/
+// class2 element-wise operation
+
+void Set(Blob & A, float alpha);
+// element-wise operation: Ai = alpha
+
+void AXPY(const Blob & A, Blob & B, float alpha);
+// element-wise operation: Bi = alpha*Ai + Bi; A and B should have the same size
+
+void Add(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai + Bi; A, B and C should have the same size
+
+void Sub(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai - Bi; A, B and C should have the same size
+
+void Mult(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai * Bi; A, B and C should have the same size
+
+void Div(const Blob & A, const Blob & B, Blob & C);
+// element-wise operation: Ci = Ai / Bi; A, B and C should have the same size
+
+void Scale(const Blob & A, Blob & B, float alpha);
+// element-wise operation: Bi = alpha*Ai
+
+void Sigmoid(const Blob & A, Blob & B, float t);
+// element-wise operation: Bi = 1/(1 + e^(-Ai*t))
+
+void Relu(const Blob & A, Blob & B, float t = 0);
+// element-wise (leaky) relu: Bi = ((1-t)*abs(Ai) + (1+t)*Ai)/2
+
+void Tanh(const Blob & A, Blob & B, float t);
+// element-wise operation: Bi = tanh(Ai*t)
+
+void Exp(const Blob & A, Blob & B, float alpha = 2.71);
+// element-wise operation: Bi = alpha^Ai (the default alpha approximates e)
+// ww: there are many element-wise operations, e.g., log, square, sqrt.
+// If MKL/OpenBLAS or other libraries do not optimize these operations in an ad-hoc manner,
+// then we can implement them using the E_Func below, passing the basic log/square/sqrt operations.
+
+template<typename Op> void E_Func(Blob & A, float alpha);
+template<typename Op> void E_Func(const Blob & A, Blob & B, float alpha, float beta);
+template<typename Op> void E_Func(const Blob & A, const Blob & B, Blob & C, float alpha, float beta);
+// element-wise generalized operation defined in Op
+
+// ww: the following functions may require thread-specific variables, e.g., a seed or random stream state.
+
+void Gaussian(Blob & A, float mu, float sigma);
+// element-wise operation: initialize each element in A following the Gaussian(mu, sigma) distribution
+
+void Uniform(Blob & A, float low, float high);
+// element-wise operation: initialize each element in A following the uniform distribution on [low, high]
+
+void Bernoulli(Blob & A, float p, int n = 1);
+// element-wise operation: initialize each element in A following the Bernoulli(n, p) distribution
+
+/**********************************************************************************/
+// class3 matrix-vector expand/reduce operation
+
+template<typename Op> void Reduce_F(const Blob & A, Blob & B);
+// reduce each row of A to an element of B, e.g. the sum operation in softmax
+template<typename Op> void Expand_F(const Blob & A, Blob & B);
+// expand each element in A into a row of B
+
+void Repmat(const Blob & A, Blob & B);
+// A is a vector, B is a matrix; set each row of B to A
+// just copies memory, so it is faster than the general expand_f
+
+// ww: may rename to MVAdd, MVSum to be consistent with MVDot, MMDot, VVDot.
+void MVAdd(const Blob & A, Blob & B, float alpha, float beta);
+// A is a vector, B is a matrix; Bij = alpha*Ai + beta*Bij
+// will use gemm, faster than the general expand_f
+
+void MVSum(const Blob & A, Blob & B, float alpha, float beta);
+// A is a vector, B is a matrix; Ai = alpha * \sum_j Bij + beta*Ai
+// will use gemm, faster than the general reduce_f
+
+void Softmax(const Blob & A, Blob & B, float alpha);
+// row-wise softmax: Bij = e^(alpha*Aij) / \sum_j e^(alpha*Aij)
+
+/**********************************************************************************/
+// class4 convolution operation
+
+void Conv(const Blob & A, const Blob & B, Blob & C);
+// A is the data, B is the parameter, C is the result
+
+void Pool(const Blob & A, Blob & B, int method);
+// A is the data, B is the result; method should indicate max or average pooling
+
+// jy: do we need to provide the corresponding gradient-computation functions?
+
+// ww: The conv/pool operations cannot be declared as above,
+// because they require other parameters, e.g., filter size, number of filters, pad, stride, etc.
+// Caffe and mxnet use cudnn in their layer implementations instead of implementing low-level operations.
+// Maybe we can follow caffe? The layer implementation is not our contribution, so we can use others' code directly.
+// For the CPU version, we use im2col and col2im for the conv layer.
+// For the GPU version, we use cudnn for the conv layer, and similarly for the other layers.
+// In conclusion, we may not implement Blob-level Conv and Pool operations.
+// Instead, we implement CaffeConvLayer, CaffePoolLayer, cudnnConvLayer, cudnnPoolLayer.
+// Later we may add IntelConvLayer (cpu), NeonConvLayer (gpu).
+
+Blob setcolspace(const Blob & A);
+void im2col(const Blob & A, Blob & B);
+void col2im(const Blob & A, Blob & B);
+// given an img, use setcolspace to generate the colspace Blob;
+// use pack/unpack (im2col/col2im) to get the data in col/img form

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/singa_op.h b/include/singa/blob/singa_op.h
new file mode 100644
index 0000000..7ca06db
--- /dev/null
+++ b/include/singa/blob/singa_op.h
@@ -0,0 +1,150 @@
+#ifndef SINGA_OP_H
+#define SINGA_OP_H
+
+#include <cmath>
+#include <algorithm>
+
+// Basic scalar operations plugged into cpu_e_f/cpu_reduce_f/cpu_expand_f.
+// Convention: in the *_grad ops, a is the forward output of the op.
+namespace op {
+    struct Set {
+        inline static void Map(float alpha, float & a) {
+            a = alpha;
+        }
+    };
+
+    struct Scale {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * a;
+        }
+    };
+    struct Scale_grad {
+        inline static void Map(float alpha, float & output) {
+            output = alpha;
+        }
+    };
+
+    struct Exp {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = pow(alpha, a);  // b = alpha^a, matching Exp in math_blob.h
+        }
+    };
+    struct Exp_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a * log(alpha);
+            // log is the natural log with base e
+        }
+    };
+
+    struct Gsigmoid {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 1.0f / (1.0f + expf(-a * alpha));
+        }
+    };
+    struct Gsigmoid_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * a * (1.0f - a);
+        }
+    };
+
+    struct Grelu {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = (1 - alpha) * std::max(a, 0.0f) + alpha * a;
+        }
+    };
+    struct Grelu_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a > 0.0f ? 1.0f : alpha;
+        }
+    };
+
+    struct Gtanh {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = tanhf(a * alpha);
+        }
+    };
+    struct Gtanh_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = alpha * (1.0f - a * a);
+        }
+    };
+
+    struct Softplus {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = logf(1 + expf(a));
+        }
+    };
+    struct Softplus_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 1.0f / (1.0f + expf(-a));
+        }
+    };
+
+    struct Square {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = a * a;
+        }
+    };
+    struct Square_grad {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = 2 * sqrt(a);  // a is the forward output (a = x^2); note this loses the sign of x for x < 0
+        }
+    };
+
+    struct Sqrt {
+        inline static void Map(float alpha, const float & a, float & b) {
+            b = sqrt(a);
+        }
+    };
+
+    struct Threshold {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a < b ? 1.0f : 0.0f;
+        }
+    };
+
+    struct Add {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a + b;
+        }
+    };
+
+    struct Sub {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a - b;
+        }
+    };
+
+    struct Mult {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a * b;
+        }
+    };
+
+    struct Div {
+        inline static void Map(float alpha, float beta, const float & a, const float & b, float & c) {
+            c = a / b;
+        }
+    };
+
+    struct Sum {
+        inline static void Map(const float * a, int n, float & b) {
+            b = 0;
+            for(int i = 0; i < n; i++)
+            {
+                b += a[i];
+            }
+        }
+    };
+
+    struct Repmat {
+        inline static void Map(const float & a, int n, float * b) {
+            for(int i = 0; i < n; i++)
+            {
+                b[i] = a;
+            }
+        }
+    };
+
+}  // namespace op
+
+#endif  // SINGA_OP_H

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/06cdf366/include/singa/blob/test.cc
----------------------------------------------------------------------
diff --git a/include/singa/blob/test.cc b/include/singa/blob/test.cc
new file mode 100644
index 0000000..d13ed5e
--- /dev/null
+++ b/include/singa/blob/test.cc
@@ -0,0 +1,165 @@
+#include <iostream>
+
+#include "singa_op.h"
+#include "math_addr.h"
+
+using namespace std;
+
+// Every test prints the difference between the library result and a naive
+// reference computation, so a correct run prints only zeros.
+void test_gemm1()
+{
+    float A[3][2] = {};
+    float B[3][2] = {};
+    float C[2][2] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            A[i][j] = i + j;
+            B[i][j] = i + j - i*j;
+        }
+    cpu_gemm(A[0], B[0], 2, 2, 3, 1, 0, true, false, C[0]);  // C = A^T * B
+    float D[2][2] = {};
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            D[i][j] = 0;
+            for(int k = 0; k < 3; k++)
+                D[i][j] += A[k][i] * B[k][j];
+        }
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            cout << C[i][j] - D[i][j] << endl;
+        }
+}
+
+void test_gemm2()
+{
+    float A[2][3] = {};
+    float B[3][2] = {};
+    float C[2][2] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            A[j][i] = i - j;
+            B[i][j] = i + j + i*j;
+        }
+    cpu_gemm(A[0], B[0], 2, 2, 3, 1, 0, false, false, C[0]);  // C = A * B
+    float D[2][2] = {};
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            D[i][j] = 0;
+            for(int k = 0; k < 3; k++)
+                D[i][j] += A[i][k] * B[k][j];
+        }
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+            cout << C[i][j] - D[i][j] << endl;
+        }
+}
+
+void test_gemv()
+{
+    float A[4][3] = {};
+    float B[4] = {};
+    float C[3] = {};
+    float D[3] = {};
+    for(int i = 0; i < 3; i++)      // columns of the 4*3 matrix A
+    {
+        for(int j = 0; j < 4; j++)  // rows
+        {
+            A[j][i] = i - j + i*j;
+        }
+    }
+    for(int i = 0; i < 4; i++) B[i] = i;
+    for(int i = 0; i < 3; i++) C[i] = 10;
+    cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);  // C = A^T * B + C
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 4; j++)
+        {
+            D[i] += A[j][i] * B[j];
+        }
+    for(int i = 0; i < 3; i++) cout << C[i] - D[i] - 10 << endl;
+}
+
+void test_axpy()
+{
+    float A[4][3] = {};
+    float C[4][3] = {};
+    float B[3][4] = {};
+    float D[3][4] = {};
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 3; j++)
+        {
+            A[i][j] = i - j + i*j;
+            B[j][i] = i - j + i*j;
+            C[i][j] = A[i][j];
+            D[j][i] = B[j][i];
+        }
+    }
+    cpu_axpy(A[0], 12, 2, B[0]);  // B += 2*A over all 12 elements
+    for(int i = 0; i < 12; i++) D[0][i] += 2 * C[0][i];
+    for(int i = 0; i < 3; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            cout << B[i][j] - D[i][j] << endl;
+        }
+    }
+}
+
+void test_eop()
+{
+    float A[10] = {};
+    float B[10] = {};
+    float C[10] = {};
+    float D[10] = {};
+    float O[10] = {};
+    for(int i = 0; i < 10; i++)
+    {
+        A[i] = i;
+        B[i] = -i;
+        C[i] = i;
+    }
+    cpu_e_f<op::Set>(5, 15, O);  // set the first 5 elements of O to 15
+    for(int i = 0; i < 5; i++) cout << O[i] - 15 << endl;
+    for(int i = 5; i < 10; i++) cout << O[i] << endl;
+    cpu_e_f<op::Scale>(10, C, 2, C);  // in place: C = 2*C
+    for(int i = 0; i < 10; i++) cout << C[i] - 2*i << endl;
+    cpu_e_f<op::Add>(10, A, B, 0, 0, O);  // O = A + B = 0
+    for(int i = 0; i < 10; i++) cout << O[i] << endl;
+}
+
+void test_exrd()
+{
+    float A[3][10] = {};
+    float B[3] = {};
+    for(int i = 0; i < 3; i++)
+        for(int j = 0; j < 10; j++)
+        {
+            A[i][j] = (i + 1) * j;
+        }
+    cpu_reduce_f<op::Sum>(A[0], 3, 10, B);  // B[i] = sum of row i = 45*(i+1)
+    for(int i = 0; i < 3; i++) B[i] -= 45 * (i + 1);
+    for(int i = 0; i < 3; i++) cout << B[i] << endl;
+    cpu_expand_f<op::Repmat>(B, 3, 10, A[0]);  // expand the (now zero) B back into A
+    cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
+    for(int i = 0; i < 3; i++) cout << B[i] << endl;
+}
+
+int main()
+{
+    test_gemm1();
+    test_gemm2();
+    test_gemv();
+    test_axpy();
+    test_eop();
+    test_exrd();
+    return 0;
+}
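No build glue ships with this commit, so a note on running the tests: assuming a CBLAS implementation is installed (OpenBLAS is one option; the header location and linker flag are platform-dependent assumptions), something like "g++ -o blob_math_test test.cc math_addr.cc -lopenblas" from include/singa/blob should build the driver. Since every test emits library-minus-reference differences, a correct run prints only zeros.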