[SYSTEMML2259] Performance dense-sparse MV binary multiply operations This patch improves the performance of dense-dense and dense-sparse binary multiply operations with sparse output, which so far fell back to a generic MV case with binary search and reallocation overheads. First, for MV mult with column vectors, we now use a new special case that allocates the output directly into the memory-efficient CSR format. Second, for MV mult with row vectors, we preallocate sparse rows.
On a scenario of 100x MV binary operations of the following shapes, the cumulative runtime improved as follows: a) 37,500x200 (dense) * 37,500x1 (dense, sp=0.05): 10.7s -> 1.3s b) 200x37,500 (dense) * 1x37,500 (sparse, sp=0.05): 3.8s -> 2.1s Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/fde708f5 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/fde708f5 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/fde708f5 Branch: refs/heads/master Commit: fde708f5df93bd7dd1b2c523b43ee128a53d7809 Parents: 5590513 Author: Matthias Boehm <[email protected]> Authored: Thu Apr 19 21:15:33 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Thu Apr 19 21:15:33 2018 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixBincell.java | 56 +++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/fde708f5/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java index 5a93508..79b4c36 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixBincell.java @@ -44,6 +44,7 @@ import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.matrix.operators.ScalarOperator; import org.apache.sysml.runtime.util.DataConverter; import org.apache.sysml.runtime.util.SortUtils; +import org.apache.sysml.runtime.util.UtilFunctions; /** * MB: @@ -211,6 +212,10 @@ public class LibMatrixBincell safeBinaryMVDense(m1, m2, ret, op); else if( m1.sparse ) //SPARSE m1 safeBinaryMVSparse(m1, m2, ret, op); + else if( !m1.sparse && !m2.sparse && ret.sparse && op.fn instanceof Multiply + && atype == BinaryAccessType.MATRIX_COL_VECTOR + && (long)m1.rlen * m2.clen < Integer.MAX_VALUE ) + safeBinaryMVDenseSparseMult(m1, m2, ret, op); else //generic combinations safeBinaryMVGeneric(m1, m2, ret, op); } @@ -440,6 +445,44 @@ public class LibMatrixBincell //no need to recomputeNonZeros since maintained in append value } + private static void safeBinaryMVDenseSparseMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) { + if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) + return; + int rlen = m1.rlen; + int clen = m1.clen; + BinaryAccessType atype = getBinaryAccessType(m1, m2); + double[] a = m1.getDenseBlockValues(); + double[] b = m2.getDenseBlockValues(); + + //note: invocation condition ensures max int nnz + if( atype == BinaryAccessType.MATRIX_COL_VECTOR ) { + //count output nnz (for CSR preallocation) + int nnz = 0; + for(int i=0, aix=0; i<rlen; i++, aix+=clen) + nnz += (b[i] != 0) ? UtilFunctions + .countNonZeros(a, aix, clen) : 0; + //allocate and compute output in CSR format + int[] rptr = new int[rlen+1]; + int[] indexes = new int[nnz]; + double[] vals = new double[nnz]; + rptr[0] = 0; + for( int i=0, aix=0, pos=0; i<rlen; i++, aix+=clen ) { + double bval = b[i]; + if( bval != 0 ) { + for( int j=0; j<clen; j++ ) { + indexes[pos] = j; + vals[pos] = a[aix+j] * bval; + pos++; + } + } + rptr[i+1] = pos; + } + ret.sparseBlock = new SparseBlockCSR( + rptr, indexes, vals, nnz); + ret.setNonZeros(nnz); + } + } + private static void safeBinaryMVGeneric(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, BinaryOperator op) { boolean isMultiply = (op.fn instanceof Multiply); boolean skipEmpty = (isMultiply); @@ -486,22 +529,21 @@ public class LibMatrixBincell //if the right hand side row vector is sparse we have to exploit that; //otherwise, both sparse access (binary search) and asymtotic behavior //in the number of cells become major bottlenecks - if( m2.sparse && isMultiply ) //SPARSE * + if( m2.sparse && ret.sparse && isMultiply ) //SPARSE * { //note: sparse block guaranteed to be allocated (otherwise early about) SparseBlock b = m2.sparseBlock; + SparseBlock c = ret.sparseBlock; if( b.isEmpty(0) ) return; int blen = b.size(0); //always pos 0 int[] bix = b.indexes(0); double[] bvals = b.values(0); for( int i=0; i<rlen; i++ ) { - //for each row iterate only over non-zeros elements in rhs - for( int j=0; j<blen; j++ ) { - double v1 = m1.quickGetValue(i, bix[j]); - double v = op.fn.execute( v1, bvals[j] ); - ret.appendValue(i, bix[j], v); - } + c.allocate(i, blen); + for( int j=0; j<blen; j++ ) + c.append(i, bix[j], m1.quickGetValue(i, bix[j]) * bvals[j]); } + ret.setNonZeros(c.size()); } else //GENERAL CASE {
