[SYSTEMML-2062] Performance ultra-sparse transpose (adaptive blocking) This patch improves the performance of sparse-sparse transpose operations for ultra-sparse matrices. So far, we simply uses a blocksize of 128 x 128 (equivalent to dense) for cache-conscious operations. For ultra-sparse matrices, this creates a lot of overhead for maintaining the block boundaries. We now use an adaptive (sparsity-aware) block size that is derived from the input sparsity.
On scenarios of executing 100 transpose operations, the total runtime improved as follows: a) 100K x 100K, sp=1e-4: 12.2s --> 5.2s. b) 1M x 1M, sp=1e-6: >25min --> 24.4s. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/6b8084b9 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/6b8084b9 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/6b8084b9 Branch: refs/heads/master Commit: 6b8084b9796637a5aeb633fad84a30724bc1e29a Parents: e9fb661 Author: Matthias Boehm <[email protected]> Authored: Sun Jan 7 22:13:24 2018 -0800 Committer: Matthias Boehm <[email protected]> Committed: Sun Jan 7 22:13:24 2018 -0800 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixReorg.java | 23 ++++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/6b8084b9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java index 37663df..1c34252 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java @@ -859,12 +859,13 @@ public class LibMatrixReorg c.allocate(i, cnt[i]); } - //blocking according to typical L2 cache sizes - final int blocksizeI = 128; - final int blocksizeJ = 128; + //blocking according to typical L2 cache sizes w/ awareness of sparsity + final long xsp = (long)in.rlen*in.clen/in.nonZeros; + final int blocksizeI = Math.max(128, (int) (8*xsp)); + final int blocksizeJ = Math.max(128, (int) (8*xsp)); //temporary array for block boundaries (for preventing binary search) - int[] ix = new int[blocksizeI]; + int[] ix = new int[Math.min(blocksizeI, ru-rl)]; //blocked execution for( int bi=rl; bi<ru; bi+=blocksizeI ) @@ -873,27 +874,25 @@ public class LibMatrixReorg //find column starting positions int bimin = Math.min(bi+blocksizeI, ru); if( cl > 0 ) { - for( int i=bi; i<bimin; i++ ) - if( !a.isEmpty(i) ) { - int j = a.posFIndexGTE(i, cl); - ix[i-bi] = (j>=0) ? j : a.size(i); - } + for( int i=bi; i<bimin; i++ ) { + if( a.isEmpty(i) ) continue; + int j = a.posFIndexGTE(i, cl); + ix[i-bi] = (j>=0) ? j : a.size(i); + } } for( int bj=cl; bj<cu; bj+=blocksizeJ ) { int bjmin = Math.min(bj+blocksizeJ, cu); - //core block transpose operation for( int i=bi; i<bimin; i++ ) { if( a.isEmpty(i) ) continue; - int apos = a.pos(i); int alen = a.size(i); int[] aix = a.indexes(i); double[] avals = a.values(i); int j = ix[i-bi] + apos; //last block boundary for( ; j<apos+alen && aix[j]<bjmin; j++ ) { - c.allocate(aix[j], ennz2,n2); + c.allocate(aix[j], ennz2, n2); c.append(aix[j], i, avals[j]); } ix[i-bi] = j - apos; //keep block boundary
