Repository: incubator-systemml
Updated Branches:
  refs/heads/master 13e1bd930 -> 885267024


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88526702/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 15308fc..e2b8f12 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -146,7 +146,10 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 					int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
 					if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) {
 						// Only implemented methods for GPU
-						if (_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)){
+						if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+								|| (_op == AggOp.MAX && (_direction == Direction.RowCol))
+								|| (_op == AggOp.MIN && (_direction == Direction.RowCol))
+								|| (_op == AggOp.MEAN && (_direction == Direction.RowCol))){
 							et = ExecType.GPU;
 							k = 1;
 						}
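For readers skimming the hop change: the inner predicate now admits full (RowCol) max, min, and mean aggregates in addition to the existing sum directions, though the enclosing guard shown above still tests _op == AggOp.SUM. A minimal restatement of the inner predicate as a standalone helper (isGpuSupportedAggregate is a hypothetical name, not part of the commit):

	// Sketch of the new dispatch predicate: SUM supports all three directions,
	// while MAX/MIN/MEAN are limited to full (RowCol) aggregation for now.
	private static boolean isGpuSupportedAggregate(AggOp op, Direction dir) {
		if (op == AggOp.SUM)
			return dir == Direction.RowCol || dir == Direction.Row || dir == Direction.Col;
		if (op == AggOp.MAX || op == AggOp.MIN || op == AggOp.MEAN)
			return dir == Direction.RowCol;
		return false;
	}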

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88526702/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index bc9b93e..9dce34a 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -78,7 +78,9 @@ public class GPUInstructionParser  extends InstructionParser
 		String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
 		String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
 		String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-
+		String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary);
+		String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary);
+		String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary);
 	}
 	
 	public static GPUInstruction parseSingleInstruction (String str )
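A quick illustration of what the registration does (a hypothetical lookup; field and enum visibility are assumed here):

	// Each new opcode resolves to the AggregateUnary instruction type,
	// so parseSingleInstruction dispatches it like uak+, uark+, and uack+.
	GPUINSTRUCTION_TYPE t = String2GPUInstructionType.get("uamean");
	assert t == GPUINSTRUCTION_TYPE.AggregateUnary;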

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88526702/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a956d10..0cfffdb 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1025,16 +1025,16 @@ public class LibMatrixCUDA {
 			case OP_PLUS: {
 				switch(reductionDirection) {
 					case REDUCTION_ALL : {
-						double result = reduceAll(in, size);
+						double result = reduceAll("reduce_sum", in, size);
 						ec.setScalarOutput(output, new DoubleObject(result));
 						break;
 					}
 					case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
-						reduceRow(in, out, rlen, clen);
+						reduceRow("reduce_row", in, out, rlen, clen);
 						break;
 					}
 					case REDUCTION_ROW : {
-						reduceCol(in, out, rlen, clen);
+						reduceCol("reduce_col", in, out, rlen, clen);
 						break;
 					}
 
@@ -1056,7 +1056,12 @@ public class LibMatrixCUDA {
 			}
 			case OP_MEAN:{
 				switch(reductionDirection) {
-					case REDUCTION_ALL:
+					case REDUCTION_ALL: {
+						double result = reduceAll("reduce_sum", in, size);
+						double mean = result / size;
+						ec.setScalarOutput(output, new DoubleObject(mean));
+						break;
+					}
 					case REDUCTION_COL:
 					case REDUCTION_ROW:
 						throw new DMLRuntimeException("Internal Error - All, Row & Column mean of matrix not implemented yet for GPU ");
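The full mean reuses the sum kernel and divides by the cell count on the host. For a 2x2 matrix [1 2; 3 4], the sum pass returns 10.0 and the instruction outputs 10.0 / 4 = 2.5. A minimal sketch of that arithmetic (d_in is a hypothetical device pointer to the four values):

	// mean(X) on the GPU: one reduce_sum pass, then a scalar divide.
	double sum  = reduceAll("reduce_sum", d_in, 4);  // 1 + 2 + 3 + 4 = 10.0
	double mean = sum / 4;                           // 2.5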
@@ -1087,7 +1092,11 @@ public class LibMatrixCUDA {
 			}
 			case OP_MAX :{
 				switch(reductionDirection) {
-					case REDUCTION_ALL:
+					case REDUCTION_ALL: {
+						double result = reduceAll("reduce_max", in, size);
+						ec.setScalarOutput(output, new DoubleObject(result));
+						break;
+					}
 					case REDUCTION_COL:
 					case REDUCTION_ROW:
 						throw new DMLRuntimeException("Internal Error - All, Row & Column max of matrix not implemented yet for GPU ");
@@ -1098,7 +1107,11 @@ public class LibMatrixCUDA {
 			}
 			case OP_MIN :{
 				switch(reductionDirection) {
-					case REDUCTION_ALL:
+					case REDUCTION_ALL: {
+						double result = reduceAll("reduce_min", in, size);
+						ec.setScalarOutput(output, new DoubleObject(result));
+						break;
+					}
 					case REDUCTION_COL:
 					case REDUCTION_ROW:
 						throw new DMLRuntimeException("Internal Error - All, Row & Column min of matrix not implemented yet for GPU ");
@@ -1131,24 +1144,25 @@ public class LibMatrixCUDA {
 
 	/**
 	 * Do a simple reduction, the output of which is a single value
-	 * @param in	{@link Pointer} to matrix in device memory
-	 * @param n		size of array
+	 * @param kernelFunction	name of the kernel function to invoke
+	 * @param in				{@link Pointer} to matrix in device memory
+	 * @param n					size of array
 	 * @return	the reduced value
 	 * @throws DMLRuntimeException
 	 */
-	private static double reduceAll(Pointer in, int n) throws DMLRuntimeException {
+	private static double reduceAll(String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
 		int[] tmp = getKernelParamsForReduceAll(n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
 		Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE);
-		kernels.launchKernel("reduce", new ExecutionConfig(blocks, threads, sharedMem),
+		kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 						in, tempOut, n);
 		cudaDeviceSynchronize();
 		int s = n;
 		while (s > 1) {
 			tmp = getKernelParamsForReduceAll(n);
 			blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
-			kernels.launchKernel("reduce", new ExecutionConfig(blocks, threads, sharedMem),
+			kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 						tempOut, tempOut, s);
 			s = (s + (threads*2-1)) / (threads*2);
 		}
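With the kernel name lifted into a parameter, one host-side driver now serves sum, max, and min; each pass reduces up to 2*threads elements per block, so s = (s + (threads*2-1)) / (threads*2) is the ceiling of s / (threads*2). Hypothetical call sites under that contract (d_in is an assumed device pointer to n doubles):

	// One host-side driver, three kernels; the OP_* cases above wire these up.
	double sum  = reduceAll("reduce_sum", d_in, n);
	double max  = reduceAll("reduce_max", d_in, n);
	double min  = reduceAll("reduce_min", d_in, n);
	double mean = reduceAll("reduce_sum", d_in, n) / n;  // as in OP_MEAN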
@@ -1162,21 +1176,32 @@ public class LibMatrixCUDA {
 	/**
 	 * Do a reduction by row. Data is reduced per row and the
 	 * resulting vector is calculated.
-	 * @param in		{@link Pointer} to input matrix in device memory (size - rows * columns)
-	 * @param out		{@link Pointer} to output matrix in device memory (size - rows * 1)
-	 * @param rows	number of rows in input matrix
-	 * @param cols	number of columns in input matrix
+	 * @param kernelFunction	name of the kernel function to invoke
+	 * @param in				{@link Pointer} to input matrix in device memory (size - rows * columns)
+	 * @param out				{@link Pointer} to output matrix in device memory (size - rows * 1)
+	 * @param rows				number of rows in input matrix
+	 * @param cols				number of columns in input matrix
 	 * @throws DMLRuntimeException
 	 */
-	private static void reduceRow(Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
+	private static void reduceRow(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
 		int[] tmp = getKernelParamsForReduceByRow(rows, cols);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
-		kernels.launchKernel("reduce_row", new ExecutionConfig(blocks, threads, sharedMem),
+		kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 						in, out, rows, cols);
 		cudaDeviceSynchronize();
 	}
 
-	private static void reduceCol(Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
+	/**
+	 * Do a reduction by column. Data is reduced per column and the
+	 * resulting vector is calculated.
+	 * @param kernelFunction	name of the kernel function to invoke
+	 * @param in				{@link Pointer} to input matrix in device memory (size - rows * columns)
+	 * @param out				{@link Pointer} to output matrix in device memory (size - 1 * cols)
+	 * @param rows				number of rows in input matrix
+	 * @param cols				number of columns in input matrix
+	 * @throws DMLRuntimeException
+	 */
+	private static void reduceCol(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
 		int[] tmp = getKernelParamsForReduceByCol(rows, cols);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 		kernels.launchKernel("reduce_col", new ExecutionConfig(blocks, threads, sharedMem),
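The two direction-wise reducers share the shape contract spelled out in the javadoc: a rows x cols input yields a rows x 1 vector from reduceRow and a 1 x cols vector from reduceCol. A hypothetical usage sketch (d_in is an assumed device pointer; allocation mirrors the JCudaObject.allocate call in reduceAll):

	// Row reduction: one output cell per row; column reduction: one per column.
	Pointer rowOut = JCudaObject.allocate(rows * Sizeof.DOUBLE);
	reduceRow("reduce_row", d_in, rowOut, rows, cols);
	Pointer colOut = JCudaObject.allocate(cols * Sizeof.DOUBLE);
	reduceCol("reduce_col", d_in, colOut, rows, cols);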
@@ -1747,7 +1772,7 @@ public class LibMatrixCUDA {
 				ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
 				src, dest, rlen, clen);
 	}
-	
+
 	/**
 	 * Performs daxpy operation
 	 * 
@@ -1760,9 +1785,9 @@ public class LibMatrixCUDA {
 	 */
 	public static void axpy(ExecutionContext ec, MatrixObject in1, MatrixObject in2,
 			String outputName,  double constant) throws DMLRuntimeException {
-		if(isInSparseFormat(in1)) 
+		if(isInSparseFormat(in1))
 			((JCudaObject)in1.getGPUObject()).sparseToDense();
-		if(isInSparseFormat(in2)) 
+		if(isInSparseFormat(in2))
 			((JCudaObject)in2.getGPUObject()).sparseToDense();
 		Pointer A = ((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr;
 		Pointer B = ((JCudaObject)in2.getGPUObject()).jcudaDenseMatrixPtr;
@@ -1771,7 +1796,7 @@ public class LibMatrixCUDA {
 	    Pointer C = ((JCudaObject)out.getGPUObject()).jcudaDenseMatrixPtr;
 	    Pointer alphaPtr = pointerTo(constant);
 	    long n = (in1.getNumRows()*in1.getNumColumns());
-	    // C <- A + alpha*B 
+	    // C <- A + alpha*B
 	    // becomes
 	    // C <- A
 	    // C <- alpha*B + C
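The tidied comment spells out the standard two-step formulation of daxpy: copy A into C, then accumulate alpha*B into C in place. A sketch of those two steps in JCuda/JCublas terms (the cublasHandle and these exact calls are assumptions; the instruction code that follows the shown context may differ):

	// C <- A (device-to-device copy), then C <- alpha*B + C via daxpy
	cudaMemcpy(C, A, n * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyDeviceToDevice);
	JCublas2.cublasDaxpy(cublasHandle, (int) n, alphaPtr, B, 1, C, 1);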
