[3/7] systemml git commit: [SYSTEMML2259] Performance dense-sparse MV binary multiply operations

mboehm7 Fri, 20 Apr 2018 01:46:21 -0700

[SYSTEMML2259] Performance dense-sparse MV binary multiply operations

This patch improves the performance of dense-dense and dense-sparse
binary multiply operations with sparse output, which so far fell back to
a generic MV case with binary search and reallocation overheads. First,
for MV mult with column vectors, we now use a new special case that
allocates the output directly into the memory-efficient CSR format.
Second, for MV mult with row vectors, we preallocate sparse rows.


On a scenario of 100x MV binary operations of the following shapes, the
cumulative runtime improved as follows:
a) 37,500x200 (dense) * 37,500x1 (dense, sp=0.05): 10.7s -> 1.3s
b) 200x37,500 (dense) * 1x37,500 (sparse, sp=0.05): 3.8s -> 2.1s


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/fde708f5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/fde708f5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/fde708f5

Branch: refs/heads/master
Commit: fde708f5df93bd7dd1b2c523b43ee128a53d7809
Parents: 5590513
Author: Matthias Boehm <[email protected]>
Authored: Thu Apr 19 21:15:33 2018 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Thu Apr 19 21:15:33 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixBincell.java   | 56 +++++++++++++++++---
 1 file changed, 49 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/fde708f5/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
index 5a93508..79b4c36 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java
@@ -44,6 +44,7 @@ import 
org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysml.runtime.util.DataConverter;
 import org.apache.sysml.runtime.util.SortUtils;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 /**
  * MB:
@@ -211,6 +212,10 @@ public class LibMatrixBincell
                                safeBinaryMVDense(m1, m2, ret, op);
                        else if( m1.sparse ) //SPARSE m1
                                safeBinaryMVSparse(m1, m2, ret, op);
+                       else if( !m1.sparse && !m2.sparse && ret.sparse && 
op.fn instanceof Multiply
+                               && atype == BinaryAccessType.MATRIX_COL_VECTOR
+                               && (long)m1.rlen * m2.clen < Integer.MAX_VALUE )
+                               safeBinaryMVDenseSparseMult(m1, m2, ret, op);
                        else //generic combinations
                                safeBinaryMVGeneric(m1, m2, ret, op);
                }       
@@ -440,6 +445,44 @@ public class LibMatrixBincell
                //no need to recomputeNonZeros since maintained in append value
        }
 
+       private static void safeBinaryMVDenseSparseMult(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, BinaryOperator op) {
+               if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
+                       return;
+               int rlen = m1.rlen;
+               int clen = m1.clen;
+               BinaryAccessType atype = getBinaryAccessType(m1, m2);
+               double[] a = m1.getDenseBlockValues();
+               double[] b = m2.getDenseBlockValues();
+               
+               //note: invocation condition ensures max int nnz
+               if( atype == BinaryAccessType.MATRIX_COL_VECTOR ) {
+                       //count output nnz (for CSR preallocation)
+                       int nnz = 0;
+                       for(int i=0, aix=0; i<rlen; i++, aix+=clen)
+                               nnz += (b[i] != 0) ? UtilFunctions
+                                       .countNonZeros(a, aix, clen) : 0;
+                       //allocate and compute output in CSR format
+                       int[] rptr = new int[rlen+1];
+                       int[] indexes = new int[nnz];
+                       double[] vals = new double[nnz];
+                       rptr[0] = 0;
+                       for( int i=0, aix=0, pos=0; i<rlen; i++, aix+=clen ) {
+                               double bval = b[i];
+                               if( bval != 0 ) {
+                                       for( int j=0; j<clen; j++ ) {
+                                               indexes[pos] = j;
+                                               vals[pos] = a[aix+j] * bval;
+                                               pos++;
+                                       }
+                               }
+                               rptr[i+1] = pos;
+                       }
+                       ret.sparseBlock = new SparseBlockCSR(
+                               rptr, indexes, vals, nnz);
+                       ret.setNonZeros(nnz);
+               }
+       }
+       
        private static void safeBinaryMVGeneric(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, BinaryOperator op) {
                boolean isMultiply = (op.fn instanceof Multiply);
                boolean skipEmpty = (isMultiply);
@@ -486,22 +529,21 @@ public class LibMatrixBincell
                        //if the right hand side row vector is sparse we have 
to exploit that;
                        //otherwise, both sparse access (binary search) and 
asymtotic behavior
                        //in the number of cells become major bottlenecks
-                       if( m2.sparse && isMultiply ) //SPARSE *
+                       if( m2.sparse && ret.sparse && isMultiply ) //SPARSE *
                        {
                                //note: sparse block guaranteed to be allocated 
(otherwise early about)
                                SparseBlock b = m2.sparseBlock;
+                               SparseBlock c = ret.sparseBlock;
                                if( b.isEmpty(0) ) return; 
                                int blen = b.size(0); //always pos 0
                                int[] bix = b.indexes(0);
                                double[] bvals = b.values(0);
                                for( int i=0; i<rlen; i++ ) {
-                                       //for each row iterate only over 
non-zeros elements in rhs
-                                       for( int j=0; j<blen; j++ ) {
-                                               double v1 = m1.quickGetValue(i, 
bix[j]);
-                                               double v = op.fn.execute( v1, 
bvals[j] );
-                                               ret.appendValue(i, bix[j], v);
-                                       }
+                                       c.allocate(i, blen);
+                                       for( int j=0; j<blen; j++ )
+                                               c.append(i, bix[j], 
m1.quickGetValue(i, bix[j]) * bvals[j]);
                                }
+                               ret.setNonZeros(c.size());
                        }
                        else //GENERAL CASE
                        {

[3/7] systemml git commit: [SYSTEMML2259] Performance dense-sparse MV binary multiply operations

Reply via email to