[SYSTEMML-2220] Performance ultra-sparse block matrix mult operations This patch improves the performance of ultra-sparse matrix multiply operations with ultra-sparse lhs matrices, which have rows that only contain 1 non-zero values. In this special case we simply copy the related rhs row into the sparse output (with optional scaling) because this structure guarantees that there is no need for aggregation.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/42e30a15 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/42e30a15 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/42e30a15 Branch: refs/heads/master Commit: 42e30a159a25b03411d36816b7ff679ad97c4e33 Parents: c16738d Author: Matthias Boehm <[email protected]> Authored: Fri Mar 30 20:55:25 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Fri Mar 30 20:55:25 2018 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixMult.java | 90 ++++++++++++-------- 1 file changed, 53 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/42e30a15/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index a98cd79..8919ab6 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -1503,48 +1503,50 @@ public class LibMatrixMult for( int i=rl; i<ru; i++ ) { - if( !a.isEmpty(i) ) - { - int apos = a.pos(i); - int alen = a.size(i); - int[] aixs = a.indexes(i); - double[] avals = a.values(i); - - if( alen==1 && avals[apos]==1 ) //ROW SELECTION (no aggregation) - { - int aix = aixs[apos]; - if( rightSparse ) { //sparse right matrix (full row copy) - if( !m2.sparseBlock.isEmpty(aix) ) { - ret.rlen=m; - ret.allocateSparseRowsBlock(false); //allocation on demand - boolean ldeep = (m2.sparseBlock instanceof SparseBlockMCSR); - ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep); - ret.nonZeros += ret.sparseBlock.size(i); - } + if( a.isEmpty(i) ) continue; + int apos = a.pos(i); + int alen = a.size(i); + int[] aixs = a.indexes(i); + double[] avals = a.values(i); + + if( alen==1 ) { + //row selection (now aggregation) with potential scaling + int aix = aixs[apos]; + if( rightSparse ) { //sparse right matrix (full row copy) + if( !m2.sparseBlock.isEmpty(aix) ) { + ret.rlen=m; + ret.allocateSparseRowsBlock(false); //allocation on demand + boolean ldeep = (m2.sparseBlock instanceof SparseBlockMCSR); + ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep); + ret.nonZeros += ret.sparseBlock.size(i); } - else { //dense right matrix (append all values) - int lnnz = (int)m2.recomputeNonZeros(aix, aix, 0, n-1); - if( lnnz > 0 ) { - c.allocate(i, lnnz); //allocate once - for( int j=0; j<n; j++ ) - c.append(i, j, m2.quickGetValue(aix, j)); - ret.nonZeros += lnnz; - } + } + else { //dense right matrix (append all values) + int lnnz = (int)m2.recomputeNonZeros(aix, aix, 0, n-1); + if( lnnz > 0 ) { + c.allocate(i, lnnz); //allocate once + double[] bvals = m2.getDenseBlock().values(aix); + for( int j=0, bix=m2.getDenseBlock().pos(aix); j<n; j++ ) + c.append(i, j, bvals[bix+j]); + ret.nonZeros += lnnz; } } - else //GENERAL CASE + //optional scaling if not pure selection + if( avals[apos] != 1 ) + vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i)); + } + else //GENERAL CASE + { + for( int k=apos; k<apos+alen; k++ ) { - for( int k=apos; k<apos+alen; k++ ) + double aval = avals[k]; + int aix = aixs[k]; + for( int j=0; j<n; j++ ) { - double aval = avals[k]; - int aix = aixs[k]; - for( int j=0; j<n; j++ ) - { - double cval = ret.quickGetValue(i, j); - double cvald = aval*m2.quickGetValue(aix, j); - if( cvald != 0 ) - ret.quickSetValue(i, j, cval+cvald); - } + double cval = ret.quickGetValue(i, j); + double cvald = aval*m2.quickGetValue(aix, j); + if( cvald != 0 ) + ret.quickSetValue(i, j, cval+cvald); } } } @@ -3209,6 +3211,20 @@ public class LibMatrixMult c[ ci+7 ] = aval * b[ bi+7 ]; } } + + public static void vectMultiplyInPlace( final double aval, double[] c, int ci, final int len ) { + final int bn = len%8; + //rest, not aligned to 8-blocks + for( int j = 0; j < bn; j++, ci++) + c[ ci ] *= aval; + //unrolled 8-block (for better instruction-level parallelism) + for( int j = bn; j < len; j+=8, ci+=8) { + c[ ci+0 ] *= aval; c[ ci+1 ] *= aval; + c[ ci+2 ] *= aval; c[ ci+3 ] *= aval; + c[ ci+4 ] *= aval; c[ ci+5 ] *= aval; + c[ ci+6 ] *= aval; c[ ci+7 ] *= aval; + } + } //note: public for use by codegen for consistency public static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len )
