Repository: incubator-systemml
Updated Branches:
  refs/heads/master 6b1572e4b -> 20e33bbe8


[SYSTEMML-1311] Performance dataset/libsvm converters (sparse vectors)

This patch modifies the existing dataset-binary block and libsvm-binary
block converters to properly handle sparse input vectors, i.e., they now
scan the non-zeros directly instead of performing repeated per-cell
lookups, which avoids unnecessary binary search operations (O(nnz) vs
O(N log nnz), with nnz<=N). On an 81M x 784 mnist libsvm-binary
scenario, this improved performance from 17min24s to 16min14s.
Furthermore, this patch also cleans up the creation of dense/sparse
output vectors for binary block-dataset converters.

Finally, the change also includes various cleanups:
* Remove unnecessary gc hint in tear down of all tests
* Reduce unnecessarily large number of columns in dataset-matrix
conversion test
* Fix gpu-related import and formatting issues


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/20e33bbe
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/20e33bbe
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/20e33bbe

Branch: refs/heads/master
Commit: 20e33bbe80aa10a1b11ea179e82feb15408f6cf8
Parents: 6b1572e
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Tue Mar 7 14:26:36 2017 -0800
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Tue Mar 7 14:29:46 2017 -0800

----------------------------------------------------------------------
 .../instructions/gpu/context/GPUContext.java    |  1 -
 .../spark/utils/RDDConverterUtils.java          | 97 ++++++++++++--------
 .../org/apache/sysml/utils/GPUStatistics.java   |  4 +-
 .../test/integration/AutomatedTestBase.java     |  2 -
 .../DataFrameMatrixConversionTest.java          |  2 +-
 5 files changed, 63 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index a7c0ab8..b792882 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -21,7 +21,6 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 import java.util.ArrayList;
 import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
 import org.apache.sysml.api.DMLScript;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
index 902924a..f370890 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
@@ -36,7 +36,7 @@ import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.ml.feature.LabeledPoint;
-import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.SparseVector;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.ml.linalg.Vectors;
@@ -64,7 +64,6 @@ import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixCell;
 import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
-import org.apache.sysml.runtime.matrix.data.SparseBlock;
 import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue;
 import org.apache.sysml.runtime.matrix.mapred.ReblockBuffer;
 import org.apache.sysml.runtime.util.DataConverter;
@@ -416,6 +415,16 @@ public class RDDConverterUtils
                return lnnz;
        }
        
+       private static Vector createVector(MatrixBlock row) {
+               if( row.isEmptyBlock(false) ) //EMPTY SPARSE ROW
+                       return Vectors.sparse(row.getNumColumns(), new int[0], 
new double[0]);
+               else if( row.isInSparseFormat() ) //SPARSE ROW
+                       return Vectors.sparse(row.getNumColumns(), 
+                                       row.getSparseBlock().indexes(0), 
row.getSparseBlock().values(0));
+               else // DENSE ROW
+                       return Vectors.dense(row.getDenseBlock());
+       }
+       
        /////////////////////////////////
        // BINARYBLOCK-SPECIFIC FUNCTIONS
 
@@ -433,24 +442,9 @@ public class RDDConverterUtils
                        throws Exception 
                {
                        ArrayList<LabeledPoint> ret = new 
ArrayList<LabeledPoint>();
-                       for( int i=0; i<arg0.getNumRows(); i++ )
-                       {
+                       for( int i=0; i<arg0.getNumRows(); i++ ) {
                                MatrixBlock tmp = arg0.sliceOperations(i, i, 0, 
arg0.getNumColumns()-2, new MatrixBlock());
-                               double[] data = 
DataConverter.convertToDoubleVector(tmp);
-                               if( tmp.isEmptyBlock(false) ) //EMPTY SPARSE ROW
-                               {
-                                       ret.add(new 
LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), Vectors.sparse(0, new 
int[0], new double[0])));
-                               }
-                               else if( tmp.isInSparseFormat() ) //SPARSE ROW
-                               {
-                                       SparseBlock sblock = 
tmp.getSparseBlock();
-                                       ret.add(new 
LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), 
-                                                       
Vectors.sparse(sblock.size(0), sblock.indexes(0), sblock.values(0))));
-                               }
-                               else // DENSE ROW
-                               {
-                                       ret.add(new 
LabeledPoint(arg0.getValue(i, arg0.getNumColumns()-1), Vectors.dense(data)));
-                               }
+                               ret.add(new LabeledPoint(arg0.getValue(i, 
arg0.getNumColumns()-1), createVector(tmp)));
                        }
                        
                        return ret.iterator();
@@ -808,6 +802,8 @@ public class RDDConverterUtils
                        {
                                
Tuple2<org.apache.spark.mllib.regression.LabeledPoint,Long> tmp = arg0.next();
                                org.apache.spark.mllib.regression.LabeledPoint 
row = tmp._1();
+                               boolean lsparse = _sparseX || (!_labels && 
+                                               row.features() instanceof 
org.apache.spark.mllib.linalg.SparseVector);
                                long rowix = tmp._2() + 1;
                                
                                long rix = 
UtilFunctions.computeBlockIndex(rowix, _brlen);
@@ -818,7 +814,7 @@ public class RDDConverterUtils
                                        if( ix[0] !=null )
                                                flushBlocksToList(ix, mb, ret);
                                        long len = 
UtilFunctions.computeBlockSize(_rlen, rix, _brlen);
-                                       createBlocks(rowix, (int)len, ix, mb);
+                                       createBlocks(rowix, (int)len, ix, mb, 
lsparse);
                                }
                                
                                //process row data
@@ -828,12 +824,26 @@ public class RDDConverterUtils
                                        _aNnz.add((val != 0) ? 1 : 0);
                                }
                                else { //features
-                                       for( int cix=1, pix=0; cix<=ncblks; 
cix++ ) {
-                                               int lclen = 
(int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
-                                               for( int j=0; j<lclen; j++ )
-                                                       
mb[cix-1].appendValue(pos, j, row.features().apply(pix++));
+                                       int lnnz = row.features().numNonzeros();
+                                       if( row.features() instanceof 
org.apache.spark.mllib.linalg.SparseVector )
+                                       {
+                                               
org.apache.spark.mllib.linalg.SparseVector srow = 
+                                                               
(org.apache.spark.mllib.linalg.SparseVector) row.features();
+                                               for( int k=0; k<lnnz; k++ ) {
+                                                       int gix = 
srow.indices()[k]+1;
+                                                       int cix = 
(int)UtilFunctions.computeBlockIndex(gix, _bclen);
+                                                       int j = 
UtilFunctions.computeCellInBlock(gix, _bclen);
+                                                       
mb[cix-1].appendValue(pos, j, srow.values()[k]);
+                                               }
+                                       }
+                                       else { //dense
+                                               for( int cix=1, pix=0; 
cix<=ncblks; cix++ ) {
+                                                       int lclen = 
(int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
+                                                       for( int j=0; j<lclen; 
j++ )
+                                                               
mb[cix-1].appendValue(pos, j, row.features().apply(pix++));
+                                               }
                                        }
-                                       _aNnz.add(row.features().numNonzeros());
+                                       _aNnz.add(lnnz);
                                }
                        }
                
@@ -844,7 +854,7 @@ public class RDDConverterUtils
                }
                
                // Creates new state of empty column blocks for current global 
row index.
-               private void createBlocks(long rowix, int lrlen, 
MatrixIndexes[] ix, MatrixBlock[] mb)
+               private void createBlocks(long rowix, int lrlen, 
MatrixIndexes[] ix, MatrixBlock[] mb, boolean lsparse)
                {
                        //compute row block index and number of column blocks
                        long rix = UtilFunctions.computeBlockIndex(rowix, 
_brlen);
@@ -852,9 +862,9 @@ public class RDDConverterUtils
                        
                        //create all column blocks (assume dense since csv is 
dense text format)
                        for( int cix=1; cix<=ncblks; cix++ ) {
-                               int lclen = 
(int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);                        
    
+                               int lclen = 
(int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
                                ix[cix-1] = new MatrixIndexes(rix, cix);
-                               mb[cix-1] = new MatrixBlock(lrlen, lclen, 
_sparseX);
+                               mb[cix-1] = new MatrixBlock(lrlen, lclen, 
lsparse);
                                mb[cix-1].allocateDenseOrSparseBlock();
                        }
                }
@@ -1058,8 +1068,18 @@ public class RDDConverterUtils
                                        //append data to matrix blocks
                                        if( _isVector ) {
                                                Vector vect = (Vector) obj;
-                                               for( int j=0; j<lclen; j++ )
-                                                       
mb[cix-1].appendValue(pos, j, vect.apply(pix++));       
+                                               if( vect instanceof 
SparseVector ) {
+                                                       SparseVector svect = 
(SparseVector) vect;
+                                                       int[] svectIx = 
svect.indices();
+                                                       while( 
pix<svectIx.length && svectIx[pix]<cix*_bclen ) {
+                                                               int j = 
UtilFunctions.computeCellInBlock(svectIx[pix]+1, _bclen);
+                                                               
mb[cix-1].appendValue(pos, j, svect.values()[pix++]);
+                                                       }
+                                               }
+                                               else { //dense
+                                                       for( int j=0; j<lclen; 
j++ )
+                                                               
mb[cix-1].appendValue(pos, j, vect.apply(pix++));       
+                                               }
                                        }
                                        else { //row
                                                Row row = (Row) obj;
@@ -1176,13 +1196,18 @@ public class RDDConverterUtils
                        
                        //copy block data into target row
                        if( _toVector ) {
-                               double[] tmp = new double[_clen];
-                               for(Tuple2<Long, MatrixBlock> kv : arg0._2()) {
-                                       int cl = (kv._1().intValue()-1)*_bclen;
-                                       MatrixBlock mb = kv._2();
-                                       DataConverter.copyToDoubleVector(mb, 
tmp, cl);
+                               if( _clen <= _bclen ) { //single block
+                                       row[1] = 
createVector(arg0._2().iterator().next()._2());
+                               }
+                               else { //multiple column blocks
+                                       double[] tmp = new double[_clen];
+                                       for(Tuple2<Long, MatrixBlock> kv : 
arg0._2()) {
+                                               int cl = 
(kv._1().intValue()-1)*_bclen;
+                                               MatrixBlock mb = kv._2();
+                                               
DataConverter.copyToDoubleVector(mb, tmp, cl);
+                                       }
+                                       row[1] = Vectors.dense(tmp);
                                }
-                               row[1] = new DenseVector(tmp);
                        }
                        else {
                                for(Tuple2<Long, MatrixBlock> kv : arg0._2()) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java 
b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index 122a7ea..8347902 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -155,9 +155,7 @@ public class GPUStatistics {
     StringBuffer sb = new StringBuffer();
     HashMap<String, Long> miscTimerMap = _cpInstMiscTime.get(instructionName);
     if (miscTimerMap != null) {
-      List<Map.Entry<String, Long>> sortedList 
-      = new ArrayList<
-      Map.Entry<String, Long>>(miscTimerMap.entrySet());
+      List<Map.Entry<String, Long>> sortedList = new 
ArrayList<Map.Entry<String, Long>>(miscTimerMap.entrySet());
       // Sort the times to display by the most expensive first
       Collections.sort(sortedList, new Comparator<Map.Entry<String, Long>>() {
         @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java 
b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
index e6123ef..8b9f10c 100644
--- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
@@ -1487,8 +1487,6 @@ public abstract class AutomatedTestBase
                }
 
                TestUtils.clearAssertionInformation();
-
-               System.gc();
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/20e33bbe/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java
index 199100e..bf5d33d 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameMatrixConversionTest.java
@@ -50,7 +50,7 @@ public class DataFrameMatrixConversionTest extends 
AutomatedTestBase
        private final static int rows3 = 7;
        private final static int cols1 = 745;
        private final static int cols2 = 1264;
-       private final static int cols3 = 1003820;
+       private final static int cols3 = 10038;
        private final static double sparsity1 = 0.9;
        private final static double sparsity2 = 0.1;
        private final static double eps=0.0000000001;

Reply via email to