[SYSTEMML-2220] Performance ultra-sparse block matrix mult operations

This patch improves the performance of ultra-sparse matrix multiply
operations with ultra-sparse lhs matrices, which have rows that only
contain 1 non-zero values. In this special case we simply copy the
related rhs row into the sparse output (with optional scaling) because
this structure guarantees that there is no need for aggregation.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/42e30a15
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/42e30a15
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/42e30a15

Branch: refs/heads/master
Commit: 42e30a159a25b03411d36816b7ff679ad97c4e33
Parents: c16738d
Author: Matthias Boehm <[email protected]>
Authored: Fri Mar 30 20:55:25 2018 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Fri Mar 30 20:55:25 2018 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 90 ++++++++++++--------
 1 file changed, 53 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/42e30a15/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index a98cd79..8919ab6 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1503,48 +1503,50 @@ public class LibMatrixMult
                        
                        for( int i=rl; i<ru; i++ )
                        {
-                               if( !a.isEmpty(i) ) 
-                               {
-                                       int apos = a.pos(i);
-                                       int alen = a.size(i);
-                                       int[] aixs = a.indexes(i);
-                                       double[] avals = a.values(i);
-                                       
-                                       if( alen==1 && avals[apos]==1 ) //ROW 
SELECTION (no aggregation)
-                                       {
-                                               int aix = aixs[apos];
-                                               if( rightSparse ) { //sparse 
right matrix (full row copy)
-                                                       if( 
!m2.sparseBlock.isEmpty(aix) ) {
-                                                               ret.rlen=m;
-                                                               
ret.allocateSparseRowsBlock(false); //allocation on demand
-                                                               boolean ldeep = 
(m2.sparseBlock instanceof SparseBlockMCSR);
-                                                               
ret.sparseBlock.set(i, m2.sparseBlock.get(aix), ldeep);
-                                                               ret.nonZeros += 
ret.sparseBlock.size(i);
-                                                       }
+                               if( a.isEmpty(i) ) continue; 
+                               int apos = a.pos(i);
+                               int alen = a.size(i);
+                               int[] aixs = a.indexes(i);
+                               double[] avals = a.values(i);
+                               
+                               if( alen==1 ) { 
+                                       //row selection (now aggregation) with 
potential scaling
+                                       int aix = aixs[apos];
+                                       if( rightSparse ) { //sparse right 
matrix (full row copy)
+                                               if( 
!m2.sparseBlock.isEmpty(aix) ) {
+                                                       ret.rlen=m;
+                                                       
ret.allocateSparseRowsBlock(false); //allocation on demand
+                                                       boolean ldeep = 
(m2.sparseBlock instanceof SparseBlockMCSR);
+                                                       ret.sparseBlock.set(i, 
m2.sparseBlock.get(aix), ldeep);
+                                                       ret.nonZeros += 
ret.sparseBlock.size(i);
                                                }
-                                               else { //dense right matrix 
(append all values)
-                                                       int lnnz = 
(int)m2.recomputeNonZeros(aix, aix, 0, n-1);
-                                                       if( lnnz > 0 ) {
-                                                               c.allocate(i, 
lnnz); //allocate once
-                                                               for( int j=0; 
j<n; j++ )
-                                                                       
c.append(i, j, m2.quickGetValue(aix, j));
-                                                               ret.nonZeros += 
lnnz;
-                                                       }
+                                       }
+                                       else { //dense right matrix (append all 
values)
+                                               int lnnz = 
(int)m2.recomputeNonZeros(aix, aix, 0, n-1);
+                                               if( lnnz > 0 ) {
+                                                       c.allocate(i, lnnz); 
//allocate once
+                                                       double[] bvals = 
m2.getDenseBlock().values(aix);
+                                                       for( int j=0, 
bix=m2.getDenseBlock().pos(aix); j<n; j++ )
+                                                               c.append(i, j, 
bvals[bix+j]);
+                                                       ret.nonZeros += lnnz;
                                                }
                                        }
-                                       else //GENERAL CASE
+                                       //optional scaling if not pure selection
+                                       if( avals[apos] != 1 )
+                                               
vectMultiplyInPlace(avals[apos], c.values(i), c.pos(i), c.size(i));
+                               }
+                               else //GENERAL CASE
+                               {
+                                       for( int k=apos; k<apos+alen; k++ )
                                        {
-                                               for( int k=apos; k<apos+alen; 
k++ )
+                                               double aval = avals[k];
+                                               int aix = aixs[k];
+                                               for( int j=0; j<n; j++ )
                                                {
-                                                       double aval = avals[k];
-                                                       int aix = aixs[k];
-                                                       for( int j=0; j<n; j++ )
-                                                       {
-                                                               double cval = 
ret.quickGetValue(i, j);
-                                                               double cvald = 
aval*m2.quickGetValue(aix, j);
-                                                               if( cvald != 0 )
-                                                                       
ret.quickSetValue(i, j, cval+cvald);
-                                                       }
+                                                       double cval = 
ret.quickGetValue(i, j);
+                                                       double cvald = 
aval*m2.quickGetValue(aix, j);
+                                                       if( cvald != 0 )
+                                                               
ret.quickSetValue(i, j, cval+cvald);
                                                }
                                        }
                                }
@@ -3209,6 +3211,20 @@ public class LibMatrixMult
                        c[ ci+7 ] = aval * b[ bi+7 ];
                }
        }
+       
+       public static void vectMultiplyInPlace( final double aval, double[] c, 
int ci, final int len ) {
+               final int bn = len%8;
+               //rest, not aligned to 8-blocks
+               for( int j = 0; j < bn; j++, ci++)
+                       c[ ci ] *= aval;
+               //unrolled 8-block  (for better instruction-level parallelism)
+               for( int j = bn; j < len; j+=8, ci+=8) {
+                       c[ ci+0 ] *= aval; c[ ci+1 ] *= aval;
+                       c[ ci+2 ] *= aval; c[ ci+3 ] *= aval;
+                       c[ ci+4 ] *= aval; c[ ci+5 ] *= aval;
+                       c[ ci+6 ] *= aval; c[ ci+7 ] *= aval;
+               }
+       }
 
        //note: public for use by codegen for consistency
        public static void vectMultiplyWrite( double[] a, double[] b, double[] 
c, int ai, int bi, int ci, final int len )

Reply via email to