[SYSTEMML-2062] Performance ultra-sparse transpose (adaptive blocking)

This patch improves the performance of sparse-sparse transpose
operations for ultra-sparse matrices. So far, we simply uses a blocksize
of 128 x 128 (equivalent to dense) for cache-conscious operations. For
ultra-sparse matrices, this creates a lot of overhead for maintaining
the block boundaries. We now use an adaptive (sparsity-aware) block size
that is derived from the input sparsity. 

On scenarios of executing 100 transpose operations, the total runtime
improved as follows:
a) 100K x 100K, sp=1e-4: 12.2s --> 5.2s.
b) 1M x 1M, sp=1e-6:     >25min --> 24.4s.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6b8084b9
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6b8084b9
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6b8084b9

Branch: refs/heads/master
Commit: 6b8084b9796637a5aeb633fad84a30724bc1e29a
Parents: e9fb661
Author: Matthias Boehm <[email protected]>
Authored: Sun Jan 7 22:13:24 2018 -0800
Committer: Matthias Boehm <[email protected]>
Committed: Sun Jan 7 22:13:24 2018 -0800

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixReorg.java     | 23 ++++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/6b8084b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index 37663df..1c34252 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -859,12 +859,13 @@ public class LibMatrixReorg
                                        c.allocate(i, cnt[i]);
                }
                
-               //blocking according to typical L2 cache sizes 
-               final int blocksizeI = 128;
-               final int blocksizeJ = 128;
+               //blocking according to typical L2 cache sizes w/ awareness of 
sparsity
+               final long xsp = (long)in.rlen*in.clen/in.nonZeros;
+               final int blocksizeI = Math.max(128, (int) (8*xsp));
+               final int blocksizeJ = Math.max(128, (int) (8*xsp));
        
                //temporary array for block boundaries (for preventing binary 
search) 
-               int[] ix = new int[blocksizeI];
+               int[] ix = new int[Math.min(blocksizeI, ru-rl)];
                
                //blocked execution
                for( int bi=rl; bi<ru; bi+=blocksizeI )
@@ -873,27 +874,25 @@ public class LibMatrixReorg
                        //find column starting positions
                        int bimin = Math.min(bi+blocksizeI, ru);
                        if( cl > 0 ) {
-                               for( int i=bi; i<bimin; i++ )
-                                       if( !a.isEmpty(i) ) {
-                                               int j = a.posFIndexGTE(i, cl);
-                                               ix[i-bi] = (j>=0) ? j : 
a.size(i);
-                                       }
+                               for( int i=bi; i<bimin; i++ ) {
+                                       if( a.isEmpty(i) ) continue;
+                                       int j = a.posFIndexGTE(i, cl);
+                                       ix[i-bi] = (j>=0) ? j : a.size(i);
+                               }
                        }
                        
                        for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
                                int bjmin = Math.min(bj+blocksizeJ, cu);
-
                                //core block transpose operation
                                for( int i=bi; i<bimin; i++ ) {
                                        if( a.isEmpty(i) ) continue;
-                                       
                                        int apos = a.pos(i);
                                        int alen = a.size(i);
                                        int[] aix = a.indexes(i);
                                        double[] avals = a.values(i);
                                        int j = ix[i-bi] + apos; //last block 
boundary
                                        for( ; j<apos+alen && aix[j]<bjmin; j++ 
) {
-                                               c.allocate(aix[j], ennz2,n2);
+                                               c.allocate(aix[j], ennz2, n2);
                                                c.append(aix[j], i, avals[j]);
                                        }
                                        ix[i-bi] = j - apos; //keep block 
boundary

Reply via email to