This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ce3270  [SYSTEMDS-2760] Performance sparse-sparse transpose operations
3ce3270 is described below

commit 3ce32709366203eccb8da9063cedafa567f8d3bf
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Thu Dec 17 23:04:18 2020 +0100

    [SYSTEMDS-2760] Performance sparse-sparse transpose operations
    
    This patch makes two minor performance improvements to multi-threaded
    sparse-sparse transpose operations. The tasks get column groups of the
    input and append outputs to the sparse output rows. For tall&skinny
    sparse matrices there were two shortcomings.
    
    First, we aligned the minimum column group size to 8, which is only
    required for dense-dense transpose operations. This improved performance
    on above scenario.
    
    Second, if the number of cores is larger than the number of columns,
    cache blocking and related position maintenance only adds unnecessary
    overhead. Accordingly, we now use added a special case for these
    scenarios.
    
    On a scenario of a 2.5M x 50 matrix with sparsity = 0.1, the total
    execution time of 10 transpose operations improved from 2.7s to 2.5s
    (with 1) and 1.9s (with 1 and 2) respectively.
---
 .../sysds/runtime/matrix/data/LibMatrixReorg.java  | 101 ++++++++++++---------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
index edc38a7..ec1731f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
@@ -205,7 +205,7 @@ public class LibMatrixReorg
                        boolean row = (in.sparse || in.rlen >= in.clen) && 
!out.sparse;
                        int len = row ? in.rlen : in.clen;
                        int blklen = (int)(Math.ceil((double)len/k));
-                       blklen += (blklen%8 != 0)?8-blklen%8:0;
+                       blklen += (!out.sparse && (blklen%8)!=0) ? 8-blklen%8 : 
0;
                        for( int i=0; i<k & i*blklen<len; i++ )
                                tasks.add(new TransposeTask(in, out, row, 
i*blklen, Math.min((i+1)*blklen, len), cnt));
                        List<Future<Object>> taskret = pool.invokeAll(tasks);
@@ -861,50 +861,69 @@ public class LibMatrixReorg
                SparseBlock a = in.getSparseBlock();
                SparseBlock c = out.getSparseBlock();
 
-               //allocate output sparse rows
-               if( cnt != null ) {
-                       for( int i=cl; i<cu; i++ )
-                               if( cnt[i] > 0 )
-                                       c.allocate(i, cnt[i]);
+               if( cu-cl == 1 ) { // SINGLE TARGET ROW
+                       //number of columns <= num cores, use sequential scan 
over input
+                       //and avoid cache blocking and temporary position 
maintenance
+                       if( cnt[cl] > 0 )
+                               c.allocate(cl, cnt[cl]);
+                       
+                       for( int i=0; i<ru; i++ ) {
+                               if( a.isEmpty(i) ) continue;
+                               int apos = a.pos(i);
+                               int alen = a.size(i);
+                               int[] aix = a.indexes(i);
+                               double[] avals = a.values(i);
+                               for( int j=apos; j<apos+alen && aix[j]<=cl; j++ 
)
+                                       if( aix[j] == cl )
+                                               c.append(cl, i, avals[j]);
+                       }
                }
-               
-               //blocking according to typical L2 cache sizes w/ awareness of 
sparsity
-               final long xsp = (long)in.rlen*in.clen/in.nonZeros;
-               final int blocksizeI = Math.max(128, (int) (8*xsp));
-               final int blocksizeJ = Math.max(128, (int) (8*xsp));
-       
-               //temporary array for block boundaries (for preventing binary 
search) 
-               int[] ix = new int[Math.min(blocksizeI, ru-rl)];
-               
-               //blocked execution
-               for( int bi=rl; bi<ru; bi+=blocksizeI )
-               {
-                       Arrays.fill(ix, 0);
-                       //find column starting positions
-                       int bimin = Math.min(bi+blocksizeI, ru);
-                       if( cl > 0 ) {
-                               for( int i=bi; i<bimin; i++ ) {
-                                       if( a.isEmpty(i) ) continue;
-                                       int j = a.posFIndexGTE(i, cl);
-                                       ix[i-bi] = (j>=0) ? j : a.size(i);
-                               }
+               else { //GENERAL CASE
+                       //allocate output sparse rows
+                       if( cnt != null ) {
+                               for( int i=cl; i<cu; i++ )
+                                       if( cnt[i] > 0 )
+                                               c.allocate(i, cnt[i]);
                        }
                        
-                       for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
-                               int bjmin = Math.min(bj+blocksizeJ, cu);
-                               //core block transpose operation
-                               for( int i=bi; i<bimin; i++ ) {
-                                       if( a.isEmpty(i) ) continue;
-                                       int apos = a.pos(i);
-                                       int alen = a.size(i);
-                                       int[] aix = a.indexes(i);
-                                       double[] avals = a.values(i);
-                                       int j = ix[i-bi] + apos; //last block 
boundary
-                                       for( ; j<apos+alen && aix[j]<bjmin; j++ 
) {
-                                               c.allocate(aix[j], ennz2, n2);
-                                               c.append(aix[j], i, avals[j]);
+                       //blocking according to typical L2 cache sizes w/ 
awareness of sparsity
+                       final long xsp = (long)in.rlen*in.clen/in.nonZeros;
+                       final int blocksizeI = Math.max(128, (int) (8*xsp));
+                       final int blocksizeJ = Math.max(128, (int) (8*xsp));
+               
+                       //temporary array for block boundaries (for preventing 
binary search) 
+                       int[] ix = new int[Math.min(blocksizeI, ru-rl)];
+                       
+                       //blocked execution
+                       for( int bi=rl; bi<ru; bi+=blocksizeI )
+                       {
+                               Arrays.fill(ix, 0);
+                               //find column starting positions
+                               int bimin = Math.min(bi+blocksizeI, ru);
+                               if( cl > 0 ) {
+                                       for( int i=bi; i<bimin; i++ ) {
+                                               if( a.isEmpty(i) ) continue;
+                                               int j = a.posFIndexGTE(i, cl);
+                                               ix[i-bi] = (j>=0) ? j : 
a.size(i);
+                                       }
+                               }
+                               
+                               for( int bj=cl; bj<cu; bj+=blocksizeJ ) {
+                                       int bjmin = Math.min(bj+blocksizeJ, cu);
+                                       //core block transpose operation
+                                       for( int i=bi; i<bimin; i++ ) {
+                                               if( a.isEmpty(i) ) continue;
+                                               int apos = a.pos(i);
+                                               int alen = a.size(i);
+                                               int[] aix = a.indexes(i);
+                                               double[] avals = a.values(i);
+                                               int j = ix[i-bi] + apos; //last 
block boundary
+                                               for( ; j<apos+alen && 
aix[j]<bjmin; j++ ) {
+                                                       c.allocate(aix[j], 
ennz2, n2);
+                                                       c.append(aix[j], i, 
avals[j]);
+                                               }
+                                               ix[i-bi] = j - apos; //keep 
block boundary
                                        }
-                                       ix[i-bi] = j - apos; //keep block 
boundary
                                }
                        }
                }

Reply via email to