[SYSTEMML-1039] Added variance, row/col variance

Closes #383.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ad009d81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ad009d81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ad009d81

Branch: refs/heads/master
Commit: ad009d81f759caed7ed134771fc6236d7cf21866
Parents: f8d7077
Author: Nakul Jindal <[email protected]>
Authored: Wed Feb 8 11:14:43 2017 -0800
Committer: Niketan Pansare <[email protected]>
Committed: Wed Feb 8 11:14:43 2017 -0800

----------------------------------------------------------------------
 src/main/cpp/kernels/SystemML.cu                |  105 +-
 src/main/cpp/kernels/SystemML.ptx               | 2772 ++++++++----------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |    1 +
 .../instructions/GPUInstructionParser.java      |    4 +-
 .../MatrixMatrixArithmeticGPUInstruction.java   |    2 +-
 .../ScalarMatrixArithmeticGPUInstruction.java   |    2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  225 +-
 7 files changed, 1479 insertions(+), 1632 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ad009d81/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 4ce6fb2..cda28ba 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -40,7 +40,7 @@ __global__ void copyUpperToLowerTriangleDense(double* ret, 
int dim, int N) {
 }
 
 extern "C"
-__device__ double getBoolean(int val) {
+__forceinline__ __device__ double getBoolean(int val) {
        if(val == 0)
                return 0.0;
        else
@@ -51,39 +51,23 @@ __device__ double getBoolean(int val) {
 // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
 // 11=min, 12=max, 13=and, 14=or, 15=log}
 extern "C"
-__device__ double binaryOp(double x, double y, int op) {
-       // 0=plus, 1=minus, 2=multiply, 3=divide, 4=power
-       if(op == 0)
-               return x + y;
-       else if(op == 1)
-               return x - y;
-       else if(op == 2)
-               return x * y;
-       else if(op == 3)
-               return x / y;
-       else if(op == 4)
-               return pow(x, y);
-       // 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
-       else if(op == 5)
-               return getBoolean(x < y);
-       else if(op == 6)
-               return getBoolean(x <= y);
-       else if(op == 7)
-               return getBoolean(x > y);
-       else if(op == 8)
-               return getBoolean(x >= y);
-       else if(op == 9)
-               return getBoolean(x == y);
-       else if(op == 10)
-               return getBoolean(x != y);
-       // 11=min, 12=max, 13=and, 14=or, 15=log
-       else if(op == 11) {
-               return min(x, y);
-       }
-       else if(op == 12) {
-               return max(x, y);
-       }
-       return -999;
+__forceinline__ __device__ double binaryOp(double x, double y, int op) {
+       switch(op) {
+        case 0 : return x + y;
+        case 1 : return x - y;
+        case 2 : return x * y;
+        case 3 : return x / y;
+        case 4 : return pow(x, y);
+        case 5 : return getBoolean(x < y);
+        case 6 : return getBoolean(x <= y);
+        case 7 : return getBoolean(x > y);
+        case 8 : return getBoolean(x >= y);
+        case 9 : return getBoolean(x == y);
+        case 10 : return getBoolean(x != y);
+        case 11 : return min(x, y);
+        case 12 : return max(x, y);
+        default : return DBL_MAX;
+    }
 }
 
 extern "C"
@@ -158,8 +142,22 @@ __global__ void compareAndSet(double* A,  double* ret, int 
rlen, int clen, doubl
        }
 }
 
+
+/**
+ * Performs a binary cellwise arithmetic operation on 2 matrices.
+ * Either both matrices are of equal size or one of them is a vector or both 
are.
+ * @param A                 first input matrix allocated on GPU
+ * @param B                 second input matrix allocated on GPU
+ * @param C                 output allocated on GPU
+ * @param maxRlen           maximum of the row lengths of A and B
+ * @param maxClen           maximum of the column lengths of A and B
+ * @param vectorAStatus     if A is a row vector, column vector or neither
+ * @param vectorBStatus     if B is a row vector, column vector or neither
+ * @param op                the numeric code of the arithmetic operation to 
perform
+ *
+ */
 extern "C"
-__global__ void binCellOp(double* A, double* B, double* C,
+__global__ void matrix_matrix_cellwise_op(double* A, double* B, double* C,
        int maxRlen, int maxClen, int vectorAStatus, int vectorBStatus, int op) 
{
        int ix = blockIdx.x * blockDim.x + threadIdx.x;
        int iy = blockIdx.y * blockDim.y + threadIdx.y;
@@ -177,21 +175,32 @@ __global__ void binCellOp(double* A, double* B, double* C,
                else if(vectorBStatus == 2)
                        bIndex = iy; // rlen == 1
                C[outIndex] = binaryOp(A[aIndex], B[bIndex], op);
-               // printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, 
aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+               //printf("C[%d] = A[%d](%f) B[%d](%f) (%d %d)\n", outIndex, 
aIndex, A[aIndex], bIndex,  B[bIndex], (ix+1), (iy+1));
+    __syncthreads();
        }
 }
 
+/**
+ * Performs an arithmetic operation between a matrix and a scalar.
+ * C = s op A or C = A op s (where A is the matrix, s is the scalar and op is 
the operation)
+ * @param A             input matrix allocated on GPU
+ * @param scalar        scalar input
+ * @param C             output matrix allocated on GPU
+ * @param size          number of elements in matrix A
+ * @param op            number code of the arithmetic operation to perform
+ * @param isLeftScalar  whether the scalar is on the left side
+ */
 extern "C"
-__global__ void binCellScalarOp(double* A, double scalar, double* C, int 
rlenA, int clenA, int op, int isLeftScalar) {
-       int ix = blockIdx.x * blockDim.x + threadIdx.x;
-       int iy = blockIdx.y * blockDim.y + threadIdx.y;
-       int index = ix * clenA + iy;
-       if(index < rlenA*clenA) {
-               if(isLeftScalar)
+__global__ void matrix_scalar_op(double* A, double scalar, double* C, int 
size, int op, int isLeftScalar) {
+       int index = blockIdx.x *blockDim.x + threadIdx.x;
+       if(index < size) {
+               if(isLeftScalar) {
                        C[index] = binaryOp(scalar, A[index], op);
-               else
+               } else {
                        C[index] = binaryOp(A[index], scalar, op);
+    }
        }
+  __syncthreads();
 }
 
 
@@ -475,7 +484,7 @@ typedef struct {
 extern "C"
 __global__ void reduce_max(double *g_idata, double *g_odata, unsigned int n){
     MaxOp op;
-    reduce<MaxOp>(g_idata, g_odata, n, op, DBL_MIN);
+    reduce<MaxOp>(g_idata, g_odata, n, op, -DBL_MAX);
 }
 
 /**
@@ -489,7 +498,7 @@ extern "C"
 __global__ void reduce_row_max(double *g_idata, double *g_odata, unsigned int 
rows, unsigned int cols){
     MaxOp op;
     IdentityOp aop;
-    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 
DBL_MIN);
+    reduce_row<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 
-DBL_MAX);
 }
 
 /**
@@ -503,7 +512,7 @@ extern "C"
 __global__ void reduce_col_max(double *g_idata, double *g_odata, unsigned int 
rows, unsigned int cols){
     MaxOp op;
     IdentityOp aop;
-    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 
DBL_MIN);
+    reduce_col<MaxOp, IdentityOp>(g_idata, g_odata, rows, cols, op, aop, 
-DBL_MAX);
 }
 
 /**
@@ -602,7 +611,7 @@ struct MeanOp {
 extern "C"
 __global__ void reduce_row_mean(double *g_idata, double *g_odata, unsigned int 
rows, unsigned int cols){
     SumOp op;
-    MeanOp aop(rows*cols);
+    MeanOp aop(cols);
     reduce_row<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
 }
 
@@ -616,6 +625,6 @@ __global__ void reduce_row_mean(double *g_idata, double 
*g_odata, unsigned int r
 extern "C"
 __global__ void reduce_col_mean(double *g_idata, double *g_odata, unsigned int 
rows, unsigned int cols){
     SumOp op;
-    MeanOp aop(rows*cols);
+    MeanOp aop(rows);
     reduce_col<SumOp, MeanOp>(g_idata, g_odata, rows, cols, op, aop, 0.0);
 }

Reply via email to