This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 85c54928d78679137222dc0de1057eb75e39fa3f Author: Matthias Boehm <[email protected]> AuthorDate: Sun Jul 25 18:28:05 2021 +0200 [SYSTEMDS-3073] Performance ultra-sparse transpose operations This patch adds an additional specialized code path for ultra-sparse transpose operations for nnz < max(rlen, clen) in order to avoid unnecessary overhead for cache blocking and preallocation. On the following scenario it improve the cummulative transpose time from 5.061s to 0.546s. X = rand(rows=1e6, cols=1e6, sparsity=1e-11); for(i in 1:100) X = t(X) print(sum(X)) --- .../sysds/runtime/matrix/data/LibMatrixReorg.java | 32 ++++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java index ecad23c..72d4ace 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java @@ -169,15 +169,18 @@ public class LibMatrixReorg { out.allocateDenseBlock(false); //execute transpose operation + boolean ultraSparse = (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen)); if( !in.sparse && !out.sparse ) - transposeDenseToDense( in, out, 0, in.rlen, 0, in.clen ); + transposeDenseToDense(in, out, 0, in.rlen, 0, in.clen); + else if( ultraSparse ) + transposeUltraSparse(in, out); else if( in.sparse && out.sparse ) - transposeSparseToSparse( in, out, 0, in.rlen, 0, in.clen, + transposeSparseToSparse(in, out, 0, in.rlen, 0, in.clen, countNnzPerColumn(in, 0, in.rlen)); else if( in.sparse ) - transposeSparseToDense( in, out, 0, in.rlen, 0, in.clen ); + transposeSparseToDense(in, out, 0, in.rlen, 0, in.clen); else - transposeDenseToSparse( in, out ); + transposeDenseToSparse(in, out); // System.out.println("r' ("+in.rlen+", "+in.clen+", "+in.sparse+", "+out.sparse+") in "+time.stop()+" ms."); @@ -190,10 +193,11 @@ public class LibMatrixReorg { public static MatrixBlock transpose(MatrixBlock in, MatrixBlock out, int k, boolean allowCSR) { // redirect small or special cases to sequential execution - if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1 || - (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1)) || - (in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1) || - (!in.sparse && out.sparse)) { + if(in.isEmptyBlock(false) || ((long) in.rlen * (long) in.clen < PAR_NUMCELL_THRESHOLD) || k == 1 + || (SHALLOW_COPY_REORG && !in.sparse && !out.sparse && (in.rlen == 1 || in.clen == 1)) + || (in.sparse && !out.sparse && in.rlen == 1) || (!in.sparse && out.sparse && in.rlen == 1) + || (in.sparse && out.sparse && in.nonZeros < Math.max(in.rlen, in.clen)) //ultra-sparse + || (!in.sparse && out.sparse) ) { return transpose(in, out); } // set meta data and allocate output arrays (if required) @@ -901,6 +905,18 @@ public class LibMatrixReorg { } } + private static void transposeUltraSparse(MatrixBlock in, MatrixBlock out) { + //note: applied if nnz < max(rlen, clen) - so no cache blocking + // but basic, naive transposition in a single-threaded context + Iterator<IJV> iter = in.getSparseBlockIterator(); + SparseBlock b = out.getSparseBlock(); + while( iter.hasNext() ) { + IJV cell = iter.next(); + b.append(cell.getJ(), cell.getI(), cell.getV()); + } + out.setNonZeros(in.getNonZeros()); + } + private static void transposeSparseToSparse(MatrixBlock in, MatrixBlock out, int rl, int ru, int cl, int cu, int[] cnt) { //NOTE: called only in sequential or column-wise parallel execution
