This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push: new 3ce3270 [SYSTEMDS-2760] Performance sparse-sparse transpose operations 3ce3270 is described below commit 3ce32709366203eccb8da9063cedafa567f8d3bf Author: Matthias Boehm <mboe...@gmail.com> AuthorDate: Thu Dec 17 23:04:18 2020 +0100 [SYSTEMDS-2760] Performance sparse-sparse transpose operations This patch makes two minor performance improvements to multi-threaded sparse-sparse transpose operations. The tasks get column groups of the input and append outputs to the sparse output rows. For tall&skinny sparse matrices there were two shortcomings. First, we aligned the minimum column group size to 8, which is only required for dense-dense transpose operations. This improved performance on above scenario. Second, if the number of cores is larger than the number of columns, cache blocking and related position maintenance only adds unnecessary overhead. Accordingly, we now use added a special case for these scenarios. On a scenario of a 2.5M x 50 matrix with sparsity = 0.1, the total execution time of 10 transpose operations improved from 2.7s to 2.5s (with 1) and 1.9s (with 1 and 2) respectively. --- .../sysds/runtime/matrix/data/LibMatrixReorg.java | 101 ++++++++++++--------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java index edc38a7..ec1731f 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java @@ -205,7 +205,7 @@ public class LibMatrixReorg boolean row = (in.sparse || in.rlen >= in.clen) && !out.sparse; int len = row ? in.rlen : in.clen; int blklen = (int)(Math.ceil((double)len/k)); - blklen += (blklen%8 != 0)?8-blklen%8:0; + blklen += (!out.sparse && (blklen%8)!=0) ? 8-blklen%8 : 0; for( int i=0; i<k & i*blklen<len; i++ ) tasks.add(new TransposeTask(in, out, row, i*blklen, Math.min((i+1)*blklen, len), cnt)); List<Future<Object>> taskret = pool.invokeAll(tasks); @@ -861,50 +861,69 @@ public class LibMatrixReorg SparseBlock a = in.getSparseBlock(); SparseBlock c = out.getSparseBlock(); - //allocate output sparse rows - if( cnt != null ) { - for( int i=cl; i<cu; i++ ) - if( cnt[i] > 0 ) - c.allocate(i, cnt[i]); + if( cu-cl == 1 ) { // SINGLE TARGET ROW + //number of columns <= num cores, use sequential scan over input + //and avoid cache blocking and temporary position maintenance + if( cnt[cl] > 0 ) + c.allocate(cl, cnt[cl]); + + for( int i=0; i<ru; i++ ) { + if( a.isEmpty(i) ) continue; + int apos = a.pos(i); + int alen = a.size(i); + int[] aix = a.indexes(i); + double[] avals = a.values(i); + for( int j=apos; j<apos+alen && aix[j]<=cl; j++ ) + if( aix[j] == cl ) + c.append(cl, i, avals[j]); + } } - - //blocking according to typical L2 cache sizes w/ awareness of sparsity - final long xsp = (long)in.rlen*in.clen/in.nonZeros; - final int blocksizeI = Math.max(128, (int) (8*xsp)); - final int blocksizeJ = Math.max(128, (int) (8*xsp)); - - //temporary array for block boundaries (for preventing binary search) - int[] ix = new int[Math.min(blocksizeI, ru-rl)]; - - //blocked execution - for( int bi=rl; bi<ru; bi+=blocksizeI ) - { - Arrays.fill(ix, 0); - //find column starting positions - int bimin = Math.min(bi+blocksizeI, ru); - if( cl > 0 ) { - for( int i=bi; i<bimin; i++ ) { - if( a.isEmpty(i) ) continue; - int j = a.posFIndexGTE(i, cl); - ix[i-bi] = (j>=0) ? j : a.size(i); - } + else { //GENERAL CASE + //allocate output sparse rows + if( cnt != null ) { + for( int i=cl; i<cu; i++ ) + if( cnt[i] > 0 ) + c.allocate(i, cnt[i]); } - for( int bj=cl; bj<cu; bj+=blocksizeJ ) { - int bjmin = Math.min(bj+blocksizeJ, cu); - //core block transpose operation - for( int i=bi; i<bimin; i++ ) { - if( a.isEmpty(i) ) continue; - int apos = a.pos(i); - int alen = a.size(i); - int[] aix = a.indexes(i); - double[] avals = a.values(i); - int j = ix[i-bi] + apos; //last block boundary - for( ; j<apos+alen && aix[j]<bjmin; j++ ) { - c.allocate(aix[j], ennz2, n2); - c.append(aix[j], i, avals[j]); + //blocking according to typical L2 cache sizes w/ awareness of sparsity + final long xsp = (long)in.rlen*in.clen/in.nonZeros; + final int blocksizeI = Math.max(128, (int) (8*xsp)); + final int blocksizeJ = Math.max(128, (int) (8*xsp)); + + //temporary array for block boundaries (for preventing binary search) + int[] ix = new int[Math.min(blocksizeI, ru-rl)]; + + //blocked execution + for( int bi=rl; bi<ru; bi+=blocksizeI ) + { + Arrays.fill(ix, 0); + //find column starting positions + int bimin = Math.min(bi+blocksizeI, ru); + if( cl > 0 ) { + for( int i=bi; i<bimin; i++ ) { + if( a.isEmpty(i) ) continue; + int j = a.posFIndexGTE(i, cl); + ix[i-bi] = (j>=0) ? j : a.size(i); + } + } + + for( int bj=cl; bj<cu; bj+=blocksizeJ ) { + int bjmin = Math.min(bj+blocksizeJ, cu); + //core block transpose operation + for( int i=bi; i<bimin; i++ ) { + if( a.isEmpty(i) ) continue; + int apos = a.pos(i); + int alen = a.size(i); + int[] aix = a.indexes(i); + double[] avals = a.values(i); + int j = ix[i-bi] + apos; //last block boundary + for( ; j<apos+alen && aix[j]<bjmin; j++ ) { + c.allocate(aix[j], ennz2, n2); + c.append(aix[j], i, avals[j]); + } + ix[i-bi] = j - apos; //keep block boundary } - ix[i-bi] = j - apos; //keep block boundary } } }