[SYSTEMML-2046] Large dense matrix blocks in mm, tsmm, mmchain, pmm

This patch modifies the core block matrix multiplication operators,
including mm, tsmm, mmchain, and pmm (with all their combinations of
dense and sparse matrices), to support large dense blocks. For cases
that require more detailed micro benchmarks, we still fall back to
single block operations. Quaternary mm operators are also not yet
supported. However, this now already allows for MV and similar operators
over very large dense (e.g., 24GB matrix-vector multiply in 940ms which
is still at peak memory bandwidth).


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/fc840470
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/fc840470
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/fc840470

Branch: refs/heads/master
Commit: fc84047045531769a6887b50efa1b339f46fb844
Parents: 2af9c66
Author: Matthias Boehm <[email protected]>
Authored: Mon Jan 8 22:23:10 2018 -0800
Committer: Matthias Boehm <[email protected]>
Committed: Tue Jan 9 11:45:32 2018 -0800

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/DenseBlock.java   |   8 +
 .../runtime/matrix/data/DenseBlockDRB.java      |   5 +
 .../runtime/matrix/data/DenseBlockLDRB.java     |   5 +
 .../runtime/matrix/data/LibMatrixMult.java      | 628 +++++++++++--------
 4 files changed, 369 insertions(+), 277 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/fc840470/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java
index 725cd7f..df6f2fb 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java
@@ -97,6 +97,14 @@ public abstract class DenseBlock implements Serializable
        public abstract int blockSize(int bix);
        
        /**
+        * Indicates if the dense block has a single
+        * underlying block, i.e., if numBlocks==1.
+        * 
+        * @return true if single block
+        */
+       public abstract boolean isContiguous();
+       
+       /**
         * Get the length of the dense block as the product
         * of row and column dimensions.
         * 

http://git-wip-us.apache.org/repos/asf/systemml/blob/fc840470/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java
index 03b7cd4..33dc492 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java
@@ -86,6 +86,11 @@ public class DenseBlockDRB extends DenseBlock
        public int blockSize(int bix) {
                return rlen;
        }
+       
+       @Override
+       public boolean isContiguous() {
+               return true;
+       }
 
        @Override
        public long size() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/fc840470/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java
index 9a4d47d..5f5ff79 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java
@@ -109,6 +109,11 @@ public class DenseBlockLDRB extends DenseBlock
        public int blockSize(int bix) {
                return Math.min(blen, rlen-bix*blen);
        }
+       
+       @Override
+       public boolean isContiguous() {
+               return rlen <= blen;
+       }
 
        @Override
        public long size() {

http://git-wip-us.apache.org/repos/asf/systemml/blob/fc840470/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index a90ef5d..c781a2e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -218,7 +218,7 @@ public class LibMatrixMult
                        //aggregate partial results (nnz, ret for vector/matrix)
                        ret.nonZeros = 0; //reset after execute
                        for( Future<Object> task : taskret ) {
-                               if( pm2r )
+                               if( pm2r ) //guaranteed single block
                                        vectAdd((double[])task.get(), 
ret.getDenseBlockValues(), 0, 0, ret.rlen*ret.clen);
                                else
                                        ret.nonZeros += (Long)task.get();
@@ -325,7 +325,7 @@ public class LibMatrixMult
                        for( int i=0, lb=0; i<blklens.size(); 
lb+=blklens.get(i), i++ )
                                tasks.add(new MatrixMultChainTask(mX, mV, mW, 
ct, lb, lb+blklens.get(i)));
                        //execute tasks
-                       List<Future<double[]>> taskret = pool.invokeAll(tasks); 
+                       List<Future<double[]>> taskret = pool.invokeAll(tasks);
                        pool.shutdown();
                        //aggregate partial results
                        for( Future<double[]> task : taskret )
@@ -367,7 +367,7 @@ public class LibMatrixMult
                //post-processing
                long nnz = copyUpperToLowerTriangle(ret);
                ret.setNonZeros(nnz);
-               ret.examSparsity();     
+               ret.examSparsity();
                
                //System.out.println("TSMM 
("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+")
 in "+time.stop());
        }
@@ -395,7 +395,7 @@ public class LibMatrixMult
                
                //pre-processing (no need to check isThreadSafe)
                m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
-               ret.sparse = false;     
+               ret.sparse = false;
                ret.allocateDenseBlock();
        
                //core multi-threaded matrix mult computation
@@ -406,7 +406,7 @@ public class LibMatrixMult
                        int blklen = (int)(Math.ceil((double)ret.rlen/(2*k)));
                        for( int i=0; i<2*k & i*blklen<ret.rlen; i++ )
                                tasks.add(new MatrixMultTransposeTask(m1, ret, 
leftTranspose, i*blklen, Math.min((i+1)*blklen, ret.rlen)));
-                       List<Future<Object>> rtasks = pool.invokeAll(tasks);    
+                       List<Future<Object>> rtasks = pool.invokeAll(tasks);
                        pool.shutdown();
                        for( Future<Object> rtask : rtasks )
                                rtask.get(); //error handling
@@ -416,7 +416,7 @@ public class LibMatrixMult
                }
                
                //post-processing
-               long nnz = copyUpperToLowerTriangle(ret);               
+               long nnz = copyUpperToLowerTriangle(ret);
                ret.setNonZeros(nnz);
                ret.examSparsity();     
                
@@ -926,29 +926,37 @@ public class LibMatrixMult
        private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock 
m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) 
                throws DMLRuntimeException
        {
-               double[] a = m1.getDenseBlockValues();
-               double[] b = m2.getDenseBlockValues();
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock a = m1.getDenseBlock();
+               DenseBlock b = m2.getDenseBlock();
+               DenseBlock c = ret.getDenseBlock();
                final int m = m1.rlen;
                final int n = m2.clen;
                final int cd = m1.clen;
                
                if( LOW_LEVEL_OPTIMIZATION ) {
                        if( m==1 && n==1 ) {            //DOT PRODUCT
-                               c[0] = dotProduct(a, b, cd);
+                               double[] avals = a.valuesAt(0);
+                               double[] bvals = b.valuesAt(0);
+                               c.set(0, 0, dotProduct(avals, bvals, cd));
                        }
                        else if( n>1 && cd == 1 ) {     //OUTER PRODUCT
-                               for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
-                                       if( a[i] == 1 )
-                                               System.arraycopy(b, 0, c, cix, 
n);
-                                       else if( a[i] != 0 )
-                                               vectMultiplyWrite(a[i], b, c, 
0, cix, n);
+                               double[] avals = a.valuesAt(0);
+                               double[] bvals = b.valuesAt(0);
+                               for( int i=rl; i < ru; i++) {
+                                       double[] cvals = c.values(i);
+                                       int cix = c.pos(i);
+                                       if( avals[i] == 1 )
+                                               System.arraycopy(bvals, 0, 
cvals, cix, n);
+                                       else if( avals[i] != 0 )
+                                               vectMultiplyWrite(avals[i], 
bvals, cvals, 0, cix, n);
                                        else
-                                               Arrays.fill(c, cix, cix+n, 0);
+                                               Arrays.fill(cvals, cix, cix+n, 
0);
                                }
                        }
                        else if( n==1 && cd == 1 ) {    //VECTOR-SCALAR
-                               vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
+                               double[] avals = a.valuesAt(0);
+                               double[] cvals = c.valuesAt(0);
+                               vectMultiplyWrite(b.get(0,0), avals, cvals, rl, 
rl, ru-rl);
                        }
                        else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short 
rhs)
                                matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, 
ru);
@@ -970,69 +978,90 @@ public class LibMatrixMult
                        }
                }
                else {
-                       for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, 
cix+=n) 
-                               for( int k = 0, bix=0; k < cd; k++, aix++, 
bix+=n) {
-                                       double val = a[ aix ];
-                                       if( val != 0 )
+                       for( int i = rl; i < ru; i++) {
+                               double[] avals = a.values(i);
+                               double[] cvals = c.values(i);
+                               int aix = a.pos(i), cix = c.pos(i);
+                               for( int k = 0; k < cd; k++) {
+                                       double val = avals[aix + k];
+                                       if( val != 0 ) {
+                                               double[] bvals = b.values(k);
+                                               int bix = b.pos(k);
                                                for( int j = 0; j < n; j++) 
-                                                       c[ cix+j ] += val * b[ 
bix+j ];
+                                                       cvals[cix+j] += val * 
bvals[bix+j];
+                                       }
                                }
+                       }
                }
        }
        
-       private static void matrixMultDenseDenseMVShortRHS(double[] a, double[] 
b, double[] c, int cd, int rl, int ru) 
+       private static void matrixMultDenseDenseMVShortRHS(DenseBlock a, 
DenseBlock b, DenseBlock c, int cd, int rl, int ru) 
                throws DMLRuntimeException
        {
-               for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd) 
-                       c[i] = dotProduct(a, b, aix, 0, cd);
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               for( int i=rl; i < ru; i++ )
+                       cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, 
cd);
        }
        
-       private static void matrixMultDenseDenseMVTallRHS(double[] a, double[] 
b, double[] c, int cd, int rl, int ru) 
+       private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, 
DenseBlock b, DenseBlock c, int cd, int rl, int ru) 
                throws DMLRuntimeException
        {
                final int blocksizeI = 32;
                final int blocksizeK = 2*1024; //16KB vector blocks (L1)
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
                for( int bi=rl; bi<ru; bi+=blocksizeI ) {
                        int bimin = Math.min(bi+blocksizeI, ru);
                        for( int bk=0; bk<cd; bk+=blocksizeK ) {
                                int bkmin = Math.min(bk+blocksizeK, cd);
-                               for( int i=bi, aix=bi*cd+bk; i<bimin; i++, 
aix+=cd) 
-                                       c[i] += dotProduct(a, b, aix, bk, 
bkmin-bk);
+                               for( int i=bi; i<bimin; i++) 
+                                       cvals[i] += dotProduct(a.values(i), 
bvals, a.pos(i), bk, bkmin-bk);
                        }
                }
        }
        
-       private static void matrixMultDenseDenseVM(double[] a, double[] b, 
double[] c, int n, int cd, int rl, int ru) 
+       private static void matrixMultDenseDenseVM(DenseBlock a, DenseBlock b, 
DenseBlock c, int n, int cd, int rl, int ru) 
                throws DMLRuntimeException
        {
+               double[] avals = a.valuesAt(0); //vector
+               double[] cvals = c.valuesAt(0); //vector
+               
                //parallelization over rows in rhs matrix
                //rest not aligned to blocks of 2 rows
-               final int kn = (ru-rl)%2;
-               if( kn == 1 && a[rl] != 0 )
-                       vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
-               
-               //compute blocks of 2 rows (2 instead of 4 for small n<64) 
-               for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
-                       if( a[k] != 0 && a[k+1] != 0  )
-                               vectMultiplyAdd2(a[k], a[k+1], b, c, bix, 
bix+n, 0, n);
-                       else if( a[k] != 0 )
-                               vectMultiplyAdd(a[k], b, c, bix, 0, n);
-                       else if( a[k+1] != 0 )  
-                               vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
+               final int kn = b.isContiguous() ? rl+(ru-rl)%2 : ru;
+               for( int k = rl; k < kn; k++ )
+                       if( avals[k] != 0 )
+                               vectMultiplyAdd(avals[k], b.values(k), cvals, 
b.pos(k), 0, n);
+               
+               //compute blocks of 2 rows (2 instead of 4 for small n<64)
+               double[] bvals = b.valuesAt(0); //only for special case
+               for( int k=kn, bix=kn*n; k<ru; k+=2, bix+=2*n ){
+                       if( avals[k] != 0 && avals[k+1] != 0  )
+                               vectMultiplyAdd2(avals[k], avals[k+1], bvals, 
cvals, bix, bix+n, 0, n);
+                       else if( avals[k] != 0 )
+                               vectMultiplyAdd(avals[k], bvals, cvals, bix, 0, 
n);
+                       else if( avals[k+1] != 0 )
+                               vectMultiplyAdd(avals[k+1], bvals, cvals, 
bix+n, 0, n);
                }
        }
        
-       private static void matrixMultDenseDenseMMShortLHS(double[] a, double[] 
b, double[] c, int m, int n, int cd, int rl, int ru)
+       private static void matrixMultDenseDenseMMShortLHS(DenseBlock a, 
DenseBlock b, DenseBlock c, int m, int n, int cd, int rl, int ru)
                throws DMLRuntimeException
        {
+               //TODO robustness large blocks (perf critical)
+               double[] avals = a.valuesAt(0);
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                //cache-conscious parallelization over rows in rhs matrix
                final int kn = (ru-rl)%4;
                
                //rest not aligned to blocks of 2 rows
                for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
                        for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n )
-                               if( a[aix+k] != 0 )
-                                       vectMultiplyAdd(a[aix+k], b, c, bix, 
cix, n);
+                               if( avals[aix+k] != 0 )
+                                       vectMultiplyAdd(avals[aix+k], bvals, 
cvals, bix, cix, n);
                
                final int blocksizeK = 48;
                final int blocksizeJ = 1024;
@@ -1044,25 +1073,33 @@ public class LibMatrixMult
                                int bjlen = Math.min(n, bj+blocksizeJ)-bj;
                                for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, 
cix+=n )
                                        for( int k=bk, bix=bk*n+bj; k<bkmin; 
k+=4, bix+=4*n ) {
-                                               vectMultiplyAdd4(a[aix+k], 
a[aix+k+1], a[aix+k+2], a[aix+k+3],
-                                                       b, c, bix, bix+n, 
bix+2*n, bix+3*n, cix, bjlen);
+                                               vectMultiplyAdd4(avals[aix+k], 
avals[aix+k+1], avals[aix+k+2], avals[aix+k+3],
+                                                       bvals, cvals, bix, 
bix+n, bix+2*n, bix+3*n, cix, bjlen);
                                        }
                        }
        }
        
-       private static void matrixMultDenseDenseMMSkinnyRHS(double[] a, 
double[] b, double[] c, int n2, int cd, int rl, int ru) 
+       private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, 
DenseBlock b, DenseBlock c, int n2, int cd, int rl, int ru) 
                throws DMLRuntimeException
        {
                //note: prepared rhs input via transpose for: m > n && cd > 64 
&& n < 64
                //however, explicit flag required since dimension change m2
-               for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, 
cix+=n2 ) 
-                       for( int j=0, bix=0; j<n2; j++, bix+=cd )
-                               c[cix+j] = dotProduct(a, b, aix, bix, cd);
+               for( int i=rl; i < ru; i++ ) {
+                       double[] avals = a.values(i), cvals = c.values(i);
+                       int aix = a.pos(i), cix = c.pos(i);
+                       for( int j=0; j<n2; j++ )
+                               cvals[cix+j] = dotProduct(avals, b.values(j), 
aix, b.pos(j), cd);
+               }
        }
        
-       private static void matrixMultDenseDenseMM(double[] a, double[] b, 
double[] c, int n, int cd, int rl, int ru, int cl, int cu) 
+       private static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, 
DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) 
                throws DMLRuntimeException
        {
+               //TODO robustness large blocks (perf critical)
+               double[] avals = a.valuesAt(0);
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                //1) Unrolled inner loop (for better instruction-level 
parallelism)
                //2) Blocked execution (for less cache trashing in parallel 
exec)       
                //3) Asymmetric block sizes (for less misses in inner loop, yet 
blocks in L1/L2)
@@ -1090,20 +1127,20 @@ public class LibMatrixMult
                                                int cixj = i * n + bj; //scan 
index on c
                                                
                                                //determine nnz of a (for 
sparsity-aware skipping of rows)
-                                               int knnz = 
copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
+                                               int knnz = 
copyNonZeroElements(avals, aixi, bk, bj, n, ta, tbi, bklen);
                                                //if( knnz > 0 ) //for skipping 
empty rows
                                                
                                                //rest not aligned to blocks of 
4 rows
                                                final int bn = knnz % 4;
                                                switch( bn ){
-                                                       case 1: 
vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
-                                                       case 2: 
vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
-                                                       case 3: 
vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); 
break;
+                                                       case 1: 
vectMultiplyAdd(ta[0], bvals, cvals, tbi[0], cixj, bjlen); break;
+                                                       case 2: 
vectMultiplyAdd2(ta[0],ta[1], bvals, cvals, tbi[0], tbi[1], cixj, bjlen); break;
+                                                       case 3: 
vectMultiplyAdd3(ta[0],ta[1],ta[2], bvals, cvals, tbi[0], tbi[1],tbi[2], cixj, 
bjlen); break;
                                                }
                                                
                                                //compute blocks of 4 rows 
(core inner loop)
                                                for( int k = bn; k<knnz; k+=4 ){
-                                                       vectMultiplyAdd4( 
ta[k], ta[k+1], ta[k+2], ta[k+3], b, c, 
+                                                       vectMultiplyAdd4( 
ta[k], ta[k+1], ta[k+2], ta[k+3], bvals, cvals, 
                                                                        tbi[k], 
tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
                                                }
                                        }
@@ -1113,27 +1150,27 @@ public class LibMatrixMult
        private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock 
m2, MatrixBlock ret, boolean pm2, int rl, int ru) 
                throws DMLRuntimeException 
        {
-               double[] a = m1.getDenseBlockValues();
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock a = m1.getDenseBlock();
+               DenseBlock c = ret.getDenseBlock();
                int m = m1.rlen;
                int cd = m1.clen;
-               int n = m2.clen;
                
                // MATRIX-MATRIX (VV, MV not applicable here because V always 
dense)
                if( LOW_LEVEL_OPTIMIZATION )
                {
                        SparseBlock b = m2.sparseBlock;
                        
-                       if( pm2 && m==1 )          //VECTOR-MATRIX
-                       {
+                       if( pm2 && m==1 ) {        //VECTOR-MATRIX
                                //parallelization over rows in rhs matrix
+                               double[] avals = a.valuesAt(0); //vector
+                               double[] cvals = c.valuesAt(0); //vector
                                for( int k=rl; k<ru; k++ )
-                                       if( a[k] != 0 && !b.isEmpty(k) ) {
-                                               vectMultiplyAdd(a[k], 
b.values(k), c, b.indexes(k), b.pos(k), 0, b.size(k));
+                                       if( avals[k] != 0 && !b.isEmpty(k) ) {
+                                               vectMultiplyAdd(avals[k], 
b.values(k), cvals,
+                                                       b.indexes(k), b.pos(k), 
0, b.size(k));
                                        }
                        }
-                       else                       //MATRIX-MATRIX
-                       {
+                       else {                     //MATRIX-MATRIX
                                //best effort blocking, without blocking over J 
because it is 
                                //counter-productive, even with front of 
current indexes
                                final int blocksizeK = 32;
@@ -1144,38 +1181,36 @@ public class LibMatrixMult
                                        for( int bk = 0, bimin = Math.min(ru, 
bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
                                                int bkmin = Math.min(cd, 
bk+blocksizeK);
                                                //core sub block matrix 
multiplication
-                                               for(int i = bi, aix=bi*cd, 
cix=bi*n; i < bimin; i++, aix+=cd, cix+=n) {
+                                               for(int i = bi; i < bimin; i++) 
{
+                                                       double[] avals = 
a.values(i), cvals = c.values(i);
+                                                       int aix = a.pos(i), cix 
= c.pos(i);
                                                        for( int k = bk; k < 
bkmin; k++ ) {
-                                                               double aval = 
a[aix+k];
+                                                               double aval = 
avals[aix+k];
                                                                if( aval == 0 
|| b.isEmpty(k) )
                                                                        
continue;
-                                                               
vectMultiplyAdd(aval, b.values(k), c, 
+                                                               
vectMultiplyAdd(aval, b.values(k), cvals, 
                                                                        
b.indexes(k), b.pos(k), cix, b.size(k));
                                                        }
                                                }
                                        }
                        }
                }
-               else
-               {
+               else {
                        SparseBlock b = m2.sparseBlock;
-                       for( int i=rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n 
) 
-                               for(int k = 0; k < cd; k++, aix++ ) 
-                               {
-                                       double val = a[aix];
-                                       if( val!=0 )
-                                       {
-                                               if( !b.isEmpty(k) ) 
-                                               {
-                                                       int bpos = b.pos(k);
-                                                       int blen = b.size(k);
-                                                       int[] bix = 
b.indexes(k);
-                                                       double[] bvals = 
b.values(k);   
-                                                       for(int j = bpos; j < 
bpos+blen; j++)
-                                                               c[cix+bix[j]] 
+= val * bvals[j];
-                                               }
-                                       }
+                       for( int i=rl; i < ru; i++ ) {
+                               double[] avals = a.values(i), cvals = 
c.values(i);
+                               int aix = a.pos(i), cix = c.pos(i);
+                               for(int k = 0; k < cd; k++ ) {
+                                       double val = avals [aix];
+                                       if( val == 0 || b.isEmpty(k) ) continue;
+                                       int bpos = b.pos(k);
+                                       int blen = b.size(k);
+                                       int[] bix = b.indexes(k);
+                                       double[] bvals = b.values(k);
+                                       for(int j = bpos; j < bpos+blen; j++)
+                                               cvals[cix+bix[j]] += val * 
bvals[j];
                                }
+                       }
                }
        }
 
@@ -1183,8 +1218,8 @@ public class LibMatrixMult
                throws DMLRuntimeException
        {
                SparseBlock a = m1.sparseBlock;
-               double[] b = m2.getDenseBlockValues();
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock b = m2.getDenseBlock();
+               DenseBlock c = ret.getDenseBlock();
                final int m = m1.rlen;
                final int n = m2.clen;
                final int cd = m2.rlen;
@@ -1193,7 +1228,7 @@ public class LibMatrixMult
                if( LOW_LEVEL_OPTIMIZATION ) {
                        if( m==1 && n==1 ) {            //DOT PRODUCT
                                if( !a.isEmpty(0) )
-                                       c[0] = dotProduct(a.values(0), b, 
a.indexes(0), a.pos(0), 0, a.size(0));
+                                       c.set(0, 0, dotProduct(a.values(0), 
b.values(0), a.indexes(0), a.pos(0), 0, a.size(0)));
                        }
                        else if( n==1 && cd<=2*1024 ) { //MATRIX-VECTOR (short 
rhs)
                                matrixMultSparseDenseMVShortRHS(a, b, c, rl, 
ru);
@@ -1215,32 +1250,42 @@ public class LibMatrixMult
                        }
                }
                else {
-                       for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
+                       for( int i=rl; i<ru; i++ ) {
                                if( a.isEmpty(i) ) continue; 
                                int apos = a.pos(i);
                                int alen = a.size(i);
                                int[] aix = a.indexes(i);
                                double[] avals = a.values(i);
+                               double[] cvals = c.values(i);
+                               int cix = c.pos(i);
                                for(int k = apos; k < apos+alen; k++) {
                                        double val = avals[k];
-                                       for(int j = 0, bix=aix[k]*n; j < n; j++)
-                                               c[cix+j] += val * b[bix+j];
+                                       double[] bvals = b.values(k);
+                                       int bix = b.pos(k);
+                                       for(int j = 0; j < n; j++)
+                                               cvals[cix+j] += val * 
bvals[bix+j];
                                }
                        }
                }
        }
        
-       private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, 
double[] b, double[] c, int rl, int ru) 
+       private static void matrixMultSparseDenseMVShortRHS(SparseBlock a, 
DenseBlock b, DenseBlock c, int rl, int ru) 
                throws DMLRuntimeException
        {
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
                for( int i=rl; i<ru; i++ )
                        if( !a.isEmpty(i) )
-                               c[i] = dotProduct(a.values(i), b, a.indexes(i), 
a.pos(i), 0, a.size(i));
+                               cvals[i] = dotProduct(a.values(i), bvals,
+                                       a.indexes(i), a.pos(i), 0, a.size(i));
        }
        
-       private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, 
double[] b, double[] c, int cd, long xsp, int rl, int ru) 
+       private static void matrixMultSparseDenseMVTallRHS(SparseBlock a, 
DenseBlock b, DenseBlock c, int cd, long xsp, int rl, int ru) 
                throws DMLRuntimeException
-       {       
+       {
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                final int blocksizeI = 32;
                final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 
16KB L1
                int[] curk = new int[blocksizeI];
@@ -1256,14 +1301,14 @@ public class LibMatrixMult
                                        double[] avals = a.values(i);
                                        int k = curk[i-bi] + apos;
                                        for( ; k<apos+alen && aix[k]<bkmin; k++ 
)
-                                               c[i] += avals[k] * b[aix[k]];
+                                               cvals[i] += avals[k] * 
bvals[aix[k]];
                                        curk[i-bi] = k - apos;
                                }
                        }
                }
        }
        
-       private static void matrixMultSparseDenseVM(SparseBlock a, double[] b, 
double[] c, int n, int rl, int ru) 
+       private static void matrixMultSparseDenseVM(SparseBlock a, DenseBlock 
b, DenseBlock c, int n, int rl, int ru) 
                throws DMLRuntimeException
        {
                if( a.isEmpty(0) )
@@ -1273,20 +1318,31 @@ public class LibMatrixMult
                int alen = a.size(0);
                int[] aix = a.indexes(0);
                double[] avals = a.values(0);
+               double[] cvals = c.valuesAt(0);
                int rlix = (rl==0) ? 0 : a.posFIndexGTE(0,rl);
                rlix = (rlix>=0) ? rlix : alen;
                
-               for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
-                       if( k+1<alen && aix[k+1]<ru )
-                               vectMultiplyAdd2(avals[k], avals[k+1], b, c, 
aix[k]*n, aix[++k]*n, 0, n);
-                       else
-                               vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
+               if( b.isContiguous() ) {
+                       double[] bvals = b.valuesAt(0);
+                       for( int k=rlix; k<alen && aix[k]<ru; k++ )
+                               if( k+1<alen && aix[k+1]<ru )
+                                       vectMultiplyAdd2(avals[k], avals[k+1], 
bvals, cvals, aix[k]*n, aix[++k]*n, 0, n);
+                               else
+                                       vectMultiplyAdd(avals[k], bvals, cvals, 
aix[k]*n, 0, n);
+               }
+               else {
+                       for( int k=rlix; k<alen && aix[k]<ru; k++ )
+                               vectMultiplyAdd(avals[k], b.values(aix[k]), 
cvals, b.pos(aix[k]), 0, n);
                }
        }
        
-       private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, 
double[] b, double[] c, int n, int cd, int rl, int ru) 
+       private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, 
DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) 
                throws DMLRuntimeException
-       {       
+       {
+               //TODO robustness large blocks (perf critical)
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                int arlen = a.numRows();
                for( int i=0, cix=0; i<arlen; i++, cix+=n ) {
                        if( a.isEmpty(i) ) continue;
@@ -1303,22 +1359,26 @@ public class LibMatrixMult
                        //rest not aligned to blocks of 4 rows
                        final int bn = (k2-k1) % 4;
                        switch( bn ){
-                               case 1: vectMultiplyAdd(avals[k1], b, c, 
aix[k1]*n, cix, n); break;
-                               case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], 
b, c, aix[k1]*n, aix[k1+1]*n, cix, n); break;
-                               case 3: 
vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], b, c, aix[k1]*n, 
aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
+                               case 1: vectMultiplyAdd(avals[k1], bvals, 
cvals, aix[k1]*n, cix, n); break;
+                               case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], 
bvals, cvals, aix[k1]*n, aix[k1+1]*n, cix, n); break;
+                               case 3: 
vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], bvals, cvals, aix[k1]*n, 
aix[k1+1]*n, aix[k1+2]*n, cix, n); break;
                        }
                        
                        //compute blocks of 4 rows (core inner loop)
                        for( int k = k1+bn; k<k2; k+=4 ) {
-                               vectMultiplyAdd4( avals[k], avals[k+1], 
avals[k+2], avals[k+3], b, c, 
+                               vectMultiplyAdd4( avals[k], avals[k+1], 
avals[k+2], avals[k+3], bvals, cvals, 
                                        aix[k]*n, aix[k+1]*n, aix[k+2]*n, 
aix[k+3]*n, cix, n );
                        }
                }
        }
        
-       private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, 
double[] b, double[] c, int n, int rl, int ru) 
+       private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, 
DenseBlock b, DenseBlock c, int n, int rl, int ru) 
                throws DMLRuntimeException
-       {       
+       {
+               //TODO robustness large blocks (perf critical)
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                //no blocking since b and c fit into cache anyway
                for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
                        if( a.isEmpty(i) ) continue;
@@ -1329,17 +1389,21 @@ public class LibMatrixMult
                        //rest not aligned to blocks of 4 rows
                        int bn = alen%4;
                        for( int k=apos; k<apos+bn; k++ )
-                               vectMultiplyAdd(avals[k], b, c, aix[k]*n, cix, 
n);
+                               vectMultiplyAdd(avals[k], bvals, cvals, 
aix[k]*n, cix, n);
                        //compute blocks of 4 rows (core inner loop)
                        for( int k=apos+bn; k<apos+alen; k+=4 )
-                               vectMultiplyAdd4( avals[k], avals[k+1], 
avals[k+2], avals[k+3], b, c,
+                               vectMultiplyAdd4( avals[k], avals[k+1], 
avals[k+2], avals[k+3], bvals, cvals,
                                        aix[k]*n, aix[k+1]*n, aix[k+2]*n, 
aix[k+3]*n, cix, n );
                }
        }
        
-       private static void matrixMultSparseDenseMM(SparseBlock a, double[] b, 
double[] c, int n, int cd, long xsp, int rl, int ru) 
+       private static void matrixMultSparseDenseMM(SparseBlock a, DenseBlock 
b, DenseBlock c, int n, int cd, long xsp, int rl, int ru) 
                throws DMLRuntimeException
-       {       
+       {
+               //TODO robustness large blocks (perf critical)
+               double[] bvals = b.valuesAt(0);
+               double[] cvals = c.valuesAt(0);
+               
                //blocksizes to fit blocks of B (dense) and several rows of A/C 
in common L2 cache size, 
                //while blocking A/C for L1/L2 yet allowing long scans (2 
pages) in the inner loop over j
                //in case of almost ultra-sparse matrices, we cannot ensure the 
blocking for the rhs and
@@ -1370,10 +1434,10 @@ public class LibMatrixMult
                                                        //rest not aligned to 
blocks of 4 rows
                                                        int bn = alen%4;
                                                        for( ; k<apos+bn && 
aix[k]<bkmin; k++ )
-                                                               
vectMultiplyAdd(avals[k], b, c, aix[k]*n+bj, cix, bjlen); 
+                                                               
vectMultiplyAdd(avals[k], bvals, cvals, aix[k]*n+bj, cix, bjlen); 
                                                        //compute blocks of 4 
rows (core inner loop), allowed to exceed bkmin
                                                        for( ; k<apos+alen && 
aix[k]<bkmin; k+=4 )
-                                                               
vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c, 
+                                                               
vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, 
                                                                        
aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen );
                                                        //update positions on 
last bj block
                                                        if( bj+bjlen==n )
@@ -1390,10 +1454,9 @@ public class LibMatrixMult
        {       
                SparseBlock a = m1.sparseBlock;
                SparseBlock b = m2.sparseBlock;
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock c = ret.getDenseBlock();
                int m = m1.rlen;
                int cd = m1.clen;
-               int n = m2.clen;
                
                // MATRIX-MATRIX (VV, MV not applicable here because V always 
dense)
                if(LOW_LEVEL_OPTIMIZATION)
@@ -1406,6 +1469,7 @@ public class LibMatrixMult
                                        int alen = a.size(0);
                                        int[] aix = a.indexes(0);
                                        double[] avals = a.values(0);
+                                       double[] cvals = c.valuesAt(0);
                                        int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(0,rl);
                                        rlix = (rlix>=0) ? rlix : alen;
                                        
@@ -1415,10 +1479,10 @@ public class LibMatrixMult
                                                        int blen = 
b.size(aix[k]);
                                                        int[] bix = 
b.indexes(aix[k]);
                                                        double[] bvals = 
b.values(aix[k]);
-                                                       
vectMultiplyAdd(avals[k], bvals, c, bix, bpos, 0, blen);
+                                                       
vectMultiplyAdd(avals[k], bvals, cvals, bix, bpos, 0, blen);
                                                }
                                }
-                       }       
+                       }
                        else                       //MATRIX-MATRIX
                        {
                                //block sizes for best-effort blocking w/ 
sufficient row reuse in B yet small overhead
@@ -1434,37 +1498,36 @@ public class LibMatrixMult
                                        Arrays.fill(curk, 0); //reset positions
                                        for( int bk = 0, bimin = Math.min(ru, 
bi+blocksizeI); bk < cd; bk+=blocksizeK ) {
                                                final int bkmin = Math.min(cd, 
bk+blocksizeK); 
-                               
                                                //core sub block matrix 
multiplication
-                                               for( int i=bi, cix=bi*n; 
i<bimin; i++, cix+=n ) {
-                                                       if( !a.isEmpty(i) ) {
-                                                               final int apos 
= a.pos(i);
-                                                               final int alen 
= a.size(i);
-                                                               int[] aix = 
a.indexes(i);
-                                                               double[] avals 
= a.values(i);   
-                                                               
-                                                               int k = 
curk[i-bi] + apos;
-                                                               for(; k < 
apos+alen && aix[k]<bkmin; k++) {
-                                                                       if( 
!b.isEmpty(aix[k]) )
-                                                                               
vectMultiplyAdd(avals[k], b.values(aix[k]), c, 
-                                                                               
        b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
-                                                               }
-                                                               curk[i-bi] = k 
- apos;
+                                               for( int i=bi; i<bimin; i++ ) {
+                                                       if( a.isEmpty(i) ) 
continue;
+                                                       final int apos = 
a.pos(i);
+                                                       final int alen = 
a.size(i);
+                                                       int[] aix = 
a.indexes(i);
+                                                       double[] avals = 
a.values(i);
+                                                       double[] cvals = 
c.values(i);
+                                                       int cix = c.pos(i);
+                                                       int k = curk[i-bi] + 
apos;
+                                                       for(; k < apos+alen && 
aix[k]<bkmin; k++) {
+                                                               if( 
b.isEmpty(aix[k]) ) continue;
+                                                               
vectMultiplyAdd(avals[k], b.values(aix[k]), cvals,
+                                                                       
b.indexes(aix[k]), b.pos(aix[k]), cix, b.size(aix[k]));
                                                        }
+                                                       curk[i-bi] = k - apos;
                                                }
                                        }
                                }
                        }
                }
-               else
-               {
-                       for( int i=rl, cix=rl*n; i<Math.min(ru, a.numRows()); 
i++, cix+=n ) {
+               else {
+                       for( int i=rl; i<Math.min(ru, a.numRows()); i++ ) {
                                if( a.isEmpty(i) ) continue;
                                int apos = a.pos(i);
                                int alen = a.size(i);
                                int[] aix = a.indexes(i);
                                double[] avals = a.values(i);
-                               
+                               double[] cvals = c.values(i);
+                               int cix = c.pos(i);
                                for(int k = apos; k < apos+alen; k++) {
                                        if( b.isEmpty(aix[k]) ) continue;
                                        double val = avals[k];
@@ -1473,7 +1536,7 @@ public class LibMatrixMult
                                        int[] bix = b.indexes(aix[k]);
                                        double[] bvals = b.values(aix[k]);
                                        for(int j = bpos; j < bpos+blen; j++)
-                                               c[cix+bix[j]] += val * bvals[j];
+                                               cvals[cix+bix[j]] += val * 
bvals[j];
                                }
                        }
                }
@@ -1591,7 +1654,7 @@ public class LibMatrixMult
 
        private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock 
mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru) 
        {
-               double[] a = mX.getDenseBlockValues();
+               DenseBlock a = mX.getDenseBlock();
                double[] b = mV.getDenseBlockValues();
                double[] w = (mW!=null) ? mW.getDenseBlockValues() : null;
                double[] c = ret.getDenseBlockValues();
@@ -1607,11 +1670,13 @@ public class LibMatrixMult
                final int bn = (ru-rl) % blocksizeI;
                
                //compute rest (not aligned to blocksize)
-               for( int i=rl, aix=rl*cd; i < rl+bn; i++, aix+=cd ) {
-                       double val = dotProduct(a, b, aix, 0, cd);
+               for( int i=rl; i < rl+bn; i++ ) {
+                       double[] avals = a.values(i);
+                       int aix = a.pos(i);
+                       double val = dotProduct(avals, b, aix, 0, cd);
                        val *= (weights) ? w[i] : 1;
                        val -= (weights2) ? w[i] : 0;
-                       vectMultiplyAdd(val, a, c, aix, 0, cd);
+                       vectMultiplyAdd(val, avals, c, aix, 0, cd);
                }
                
                //blockwise mmchain computation
@@ -1621,8 +1686,8 @@ public class LibMatrixMult
                        Arrays.fill(tmp, 0);
                        for( int bj=0; bj<cd; bj+=blocksizeJ ) {
                                int bjmin = Math.min(cd-bj, blocksizeJ);
-                               for( int i=0, aix=bi*cd+bj; i < blocksizeI; 
i++, aix+=cd)
-                                       tmp[i] += dotProduct(a, b, aix, bj, 
bjmin);
+                               for( int i=0; i < blocksizeI; i++ )
+                                       tmp[i] += dotProduct(a.values(bi+i), b, 
a.pos(bi+i,bj), bj, bjmin);
                        }
                        
                        //multiply/subtract weights (in-place), if required
@@ -1634,9 +1699,16 @@ public class LibMatrixMult
                        //compute 2nd matrix vector for row block and aggregate
                        for( int bj = 0; bj<cd; bj+=blocksizeJ ) {
                                int bjmin = Math.min(cd-bj, blocksizeJ);
-                               for (int i=0, aix=bi*cd+bj; i<blocksizeI; i+=4, 
aix+=4*cd)
-                                       vectMultiplyAdd4(tmp[i], tmp[i+1], 
tmp[i+2], tmp[i+3],
-                                               a, c, aix, aix+cd, aix+2*cd, 
aix+3*cd, bj, bjmin);
+                               if( a.isContiguous() ) {
+                                       double[] avals = a.values(0);
+                                       for( int i=0, aix=bi*cd+bj; 
i<blocksizeI; i+=4, aix+=4*cd )
+                                               vectMultiplyAdd4(tmp[i], 
tmp[i+1], tmp[i+2], tmp[i+3],
+                                                       avals, c, aix, aix+cd, 
aix+2*cd, aix+3*cd, bj, bjmin);
+                               }
+                               else {
+                                       for( int i=0; i<blocksizeI; i++ )
+                                               vectMultiplyAdd(tmp[i], 
a.values(bi+i), c, a.pos(bi+i,bj), bj, bjmin);
+                               }
                        }
                }
        }
@@ -1677,8 +1749,8 @@ public class LibMatrixMult
        {
                //2) transpose self matrix multiply dense
                // (compute only upper-triangular matrix due to symmetry)
-               double[] a = m1.getDenseBlockValues();
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock a = m1.getDenseBlock();
+               DenseBlock c = ret.getDenseBlock();
                int m = m1.rlen;
                int n = m1.clen;
                
@@ -1688,10 +1760,15 @@ public class LibMatrixMult
                        {
                                if( n==1 ) //VECTOR (col)
                                {
-                                       c[0] = dotProduct(a, a, m);
+                                       double[] avals = a.valuesAt(0);
+                                       c.set(0, 0, dotProduct(avals, avals, 
m));
                                }
                                else //MATRIX
-                               {       
+                               {
+                                       //TODO robustness large blocks (perf 
critical)
+                                       double[] avals = a.valuesAt(0);
+                                       double[] cvals = c.valuesAt(0);
+                                       
                                        //1) Unrolled inner loop (for better 
instruction-level parallelism)
                                        //2) Blocked execution (for less cache 
trashing in parallel exec)       
                                        //3) Asymmetric block sizes (for less 
misses in inner loop, yet blocks in L1/L2)
@@ -1710,48 +1787,50 @@ public class LibMatrixMult
                                        
                                        //blocked execution
                                        for( int bi = rl; bi < mx; 
bi+=blocksizeI ) //from bi due to symmetry
-                                               for( int bk = 0, bimin = 
Math.min(mx, bi+blocksizeI); bk < cdx; bk+=blocksizeK ) 
-                                                       for( int bj = bi, bkmin 
= Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ ) 
+                                               for( int bk = 0, bimin = 
Math.min(mx, bi+blocksizeI); bk < cdx; bk+=blocksizeK )
+                                                       for( int bj = bi, bkmin 
= Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ )
                                                        {
                                                                int bklen = 
bkmin-bk;
                                                                int bjlen = 
Math.min(nx, bj+blocksizeJ)-bj;
                                                                
                                                                //core sub 
block matrix multiplication
-                                                       for( int i = bi; i < 
bimin; i++) 
-                                                       {
-                                                               int aixi = bk*n 
+i; //start index on a (logical t(X))
-                                                               int cixj = i * 
nx + bj; //scan index on c
-                                                               
-                                                               //determine nnz 
of a (for sparsity-aware skipping of rows)
-                                                               int knnz = 
copyNonZeroElements(a, aixi, bk, bj, n, nx, ta, tbi, bklen);
-                                                               
-                                                               //rest not 
aligned to blocks of 4 rows
-                                                               final int bn = 
knnz % 4;
-                                                               switch( bn ){
-                                                                       case 1: 
vectMultiplyAdd(ta[0], a, c, tbi[0], cixj, bjlen); break;
-                                                               case 2: 
vectMultiplyAdd2(ta[0],ta[1], a, c, tbi[0], tbi[1], cixj, bjlen); break;
-                                                                       case 3: 
vectMultiplyAdd3(ta[0],ta[1],ta[2], a, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); 
break;
-                                                               }
-                                                               
-                                                               //compute 
blocks of 4 rows (core inner loop)
-                                                               for( int k = 
bn; k<knnz; k+=4 ){
-                                                                       
vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], a, c, 
-                                                                               
                  tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
-                                                               }
-                                                       }
+                                                               for( int i = 
bi; i < bimin; i++) 
+                                                               {
+                                                                       int 
aixi = bk*n +i; //start index on a (logical t(X))
+                                                                       int 
cixj = i * nx + bj; //scan index on c
+                                                                       
+                                                                       
//determine nnz of a (for sparsity-aware skipping of rows)
+                                                                       int 
knnz = copyNonZeroElements(avals, aixi, bk, bj, n, nx, ta, tbi, bklen);
+                                                                       
+                                                                       //rest 
not aligned to blocks of 4 rows
+                                                                       final 
int bn = knnz % 4;
+                                                                       switch( 
bn ){
+                                                                               
case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); break;
+                                                                               
case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], cixj, 
bjlen); break;
+                                                                               
case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], 
tbi[1],tbi[2], cixj, bjlen); break;
+                                                                       }
+                                                                       
+                                                                       
//compute blocks of 4 rows (core inner loop)
+                                                                       for( 
int k = bn; k<knnz; k+=4 ){
+                                                                               
vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals,
+                                                                               
        tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
+                                                                       }
+                                                               }
                                                        }
                                }
                        }
                        else
-                       {       
+                       {
+                               //TODO robustness large blocks (perf critical)
+                               double[] avals = a.valuesAt(0);
+                               double[] cvals = c.valuesAt(0);
+                               
                                for(int k = 0, ix1 = 0; k < m; k++, ix1+=n)
-                                       for(int i = rl, ix3 = 0; i < ru; i++, 
ix3+=n) 
-                                       {
-                                               double val = a[ ix1+i ];
-                                               if( val != 0 )
-                                               {
+                                       for(int i = rl, ix3 = 0; i < ru; i++, 
ix3+=n) {
+                                               double val = avals[ ix1+i ];
+                                               if( val != 0 ) {
                                                        for(int j = i; j < n; 
j++) //from i due to symmetry
-                                                               c[ ix3+j ]  += 
val * a[ ix1+j ];
+                                                               cvals[ ix3+j ]  
+= val * avals[ ix1+j ];
                                                }
                                        }
                        }
@@ -1762,11 +1841,16 @@ public class LibMatrixMult
                        {
                                if( m==1 ) //VECTOR
                                {
-                                       c[0] = dotProduct(a, a, n);
+                                       double[] avals = a.valuesAt(0);
+                                       c.set(0, 0, dotProduct(avals, avals, 
n));
                                }
                                else //MATRIX
                                {
-                                       //algorithm: scan c, foreach ci,j: scan 
row of a and t(a) (IJK)                         
+                                       //TODO robustness large blocks (perf 
critical)
+                                       double[] avals = a.valuesAt(0);
+                                       double[] cvals = c.valuesAt(0);
+                                       
+                                       //algorithm: scan c, foreach ci,j: scan 
row of a and t(a) (IJK)
                                
                                        //1) Unrolled inner loop, for better ILP
                                        //2) Blocked execution, for less cache 
trashing in parallel exec 
@@ -1784,20 +1868,24 @@ public class LibMatrixMult
                                                                for(int i=bi, 
ix1=bi*n+bk, ix3=bi*m; i<bimin; i++, ix1+=n, ix3+=m) {
                                                                        final 
int bjmax = Math.max(i,bj); //from i due to symmetry
                                                                        for(int 
j=bjmax, ix2=bjmax*n+bk; j <bjmin; j++, ix2+=n) 
-                                                                               
c[ ix3+j ] += dotProduct(a, a, ix1, ix2, bklen);        
+                                                                               
cvals[ ix3+j ] += dotProduct(avals, avals, ix1, ix2, bklen);
                                                                }
                                                        }
                                }
                        }
                        else
                        {
+                               //TODO robustness large blocks (perf critical)
+                               double[] avals = a.valuesAt(0);
+                               double[] cvals = c.valuesAt(0);
+                               
                                for(int i = rl, ix1 = 0, ix3 = 0; i < ru; i++, 
ix1+=n, ix3+=m)
                                        for(int j = i, ix2 = i*n; j < m; j++, 
ix2+=n) //from i due to symmetry
                                        {
                                                double val = 0;
                                                for(int k = 0; k < n; k++)
-                                                       val += a[ ix1+k ] * 
a[ix2+k];
-                                               c[ ix3+j ] = val;       
+                                                       val += avals[ ix1+k ] * 
avals[ix2+k];
+                                               cvals[ ix3+j ] = val;
                                        }
                        }
                }
@@ -1809,12 +1897,15 @@ public class LibMatrixMult
                //2) transpose self matrix multiply sparse
                // (compute only upper-triangular matrix due to symmetry)
                SparseBlock a = m1.sparseBlock;
-               double[] c = ret.getDenseBlockValues();
+               DenseBlock c = ret.getDenseBlock();
                int m = m1.rlen;
                int n = m1.clen;
 
                if( leftTranspose ) // t(X)%*%X 
                {
+                       //TODO robustness large blocks (perf critical)
+                       double[] cvals = c.valuesAt(0);
+                       
                        //only general case (because vectors always dense)
                        //algorithm: scan rows, foreach row self join (KIJ)
                        if( LOW_LEVEL_OPTIMIZATION )
@@ -1833,31 +1924,28 @@ public class LibMatrixMult
                                                double val = avals[i];
                                                if( val != 0 ) {
                                                        int ix2 = aix[i]*n;
-                                                       vectMultiplyAdd(val, 
avals, c, aix, i, ix2, len-i);
+                                                       vectMultiplyAdd(val, 
avals, cvals, aix, i, ix2, len-i);
                                                }
                                        }
                                }
                        }
                        else
                        {
-                               for( int r=0; r<a.numRows(); r++ )
-                                       if( !a.isEmpty(r) ) 
-                                       {
-                                               int apos = a.pos(r);
-                                               int alen = a.size(r);
-                                               int[] aix = a.indexes(r);
-                                               double[] avals = a.values(r);
-                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
-                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
-                                               
-                                               for(int i = rlix; i < apos+alen 
&& aix[i]<ru; i++) 
-                                               {
-                                                       double val = avals[i];
-                                                       if( val != 0 )
-                                                               for(int j = i, 
ix2 = aix[i]*n; j < apos+alen; j++)
-                                                                       
c[ix2+aix[j]] += val * avals[j];
-                                               }
+                               for( int r=0; r<a.numRows(); r++ ) {
+                                       if( a.isEmpty(r) ) continue;
+                                       int apos = a.pos(r);
+                                       int alen = a.size(r);
+                                       int[] aix = a.indexes(r);
+                                       double[] avals = a.values(r);
+                                       int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
+                                       rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
+                                       for(int i = rlix; i < apos+alen && 
aix[i]<ru; i++) {
+                                               double val = avals[i];
+                                               if( val != 0 )
+                                                       for(int j = i, ix2 = 
aix[i]*n; j < apos+alen; j++)
+                                                               
cvals[ix2+aix[j]] += val * avals[j];
                                        }
+                               }
                        }
                }
                else // X%*%t(X)
@@ -1866,12 +1954,15 @@ public class LibMatrixMult
                        {
                                if( !m1.sparseBlock.isEmpty(0) ) {
                                        int alen = m1.sparseBlock.size(0); 
//pos always 0
-                                       double[] avals = a.values(0);   
-                                       c[0] = dotProduct(avals, avals, alen);
+                                       double[] avals = a.values(0);
+                                       c.set(0, 0, dotProduct(avals, avals, 
alen));
                                }
                        }
                        else //MATRIX
-                       {                       
+                       {
+                               //TODO robustness large blocks (perf critical)
+                               double[] cvals = c.valuesAt(0);
+                               
                                //note: reorg to similar layout as t(X)%*%X 
because faster than 
                                //direct computation with IJK (no 
dependencies/branches in inner loop)
                                //see preprocessMatrixMultTransposeSelf 
m1<-tmpBlock
@@ -1882,46 +1973,41 @@ public class LibMatrixMult
                                if( LOW_LEVEL_OPTIMIZATION )
                                {
                                        int arlen = a.numRows();
-                                       for( int r=0; r<arlen; r++ )
-                                               if( !a.isEmpty(r) ) 
-                                               {
-                                                       int apos = a.pos(r);
-                                                       int alen = a.size(r);
-                                                       int[] aix = 
a.indexes(r);
-                                                       double[] avals = 
a.values(r);
-                                                       int rlix = (rl==0) ? 0 
: a.posFIndexGTE(r, rl);
-                                                       rlix = (rlix>=0) ? 
apos+rlix : apos+alen;
-                                                       
-                                                       for(int i = rlix; i < 
apos+alen && aix[i]<ru; i++) 
-                                                       {
-                                                               double val = 
avals[i];
-                                                               if( val != 0 ) {
-                                                                       int ix2 
= aix[i]*m;
-                                                                       
vectMultiplyAdd(val, avals, c, aix, i, ix2, alen-i);
-                                                               }
+                                       for( int r=0; r<arlen; r++ ) {
+                                               if( a.isEmpty(r) ) continue;
+                                               int apos = a.pos(r);
+                                               int alen = a.size(r);
+                                               int[] aix = a.indexes(r);
+                                               double[] avals = a.values(r);
+                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r, rl);
+                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
+                                               
+                                               for(int i = rlix; i < apos+alen 
&& aix[i]<ru; i++) {
+                                                       double val = avals[i];
+                                                       if( val != 0 ) {
+                                                               int ix2 = 
aix[i]*m;
+                                                               
vectMultiplyAdd(val, avals, cvals, aix, i, ix2, alen-i);
                                                        }
                                                }
+                                       }
                                }
                                else
                                {
-                                       for( int r=0; r<a.numRows(); r++ )
-                                               if( !a.isEmpty(r) ) 
-                                               {
-                                                       int apos = a.pos(r);
-                                                       int alen = a.size(r);
-                                                       int[] aix = 
a.indexes(r);
-                                                       double[] avals = 
a.values(r);
-                                                       int rlix = (rl==0) ? 0 
: a.posFIndexGTE(r,rl);
-                                                       rlix = (rlix>=0) ? 
apos+rlix : apos+alen;
-                                                       
-                                                       for(int i = rlix; i < 
apos+alen && aix[i]<ru; i++) 
-                                                       {
-                                                               double val = 
avals[i];
-                                                               if( val != 0 )
-                                                                       for(int 
j = i, ix2 = aix[i]*m; j < alen; j++)
-                                                                               
c[ix2+aix[j]] += val * avals[j];
-                                                       }
+                                       for( int r=0; r<a.numRows(); r++ ) {
+                                               if( a.isEmpty(r) ) continue;
+                                               int apos = a.pos(r);
+                                               int alen = a.size(r);
+                                               int[] aix = a.indexes(r);
+                                               double[] avals = a.values(r);
+                                               int rlix = (rl==0) ? 0 : 
a.posFIndexGTE(r,rl);
+                                               rlix = (rlix>=0) ? apos+rlix : 
apos+alen;
+                                               for(int i = rlix; i < apos+alen 
&& aix[i]<ru; i++) {
+                                                       double val = avals[i];
+                                                       if( val != 0 )
+                                                               for(int j = i, 
ix2 = aix[i]*m; j < alen; j++)
+                                                                       
cvals[ix2+aix[j]] += val * avals[j];
                                                }
+                                       }
                                }
                        }
                }
@@ -1931,33 +2017,28 @@ public class LibMatrixMult
                throws DMLRuntimeException
        {
                double[] a = pm1.getDenseBlockValues();
-               double[] b = m2.getDenseBlockValues();
-               double[] c = ret1.getDenseBlockValues();
+               DenseBlock b = m2.getDenseBlock();
+               DenseBlock c = ret1.getDenseBlock();
 
                final int n = m2.clen;
                final int brlen = ret1.getNumRows();
-               
                int lastblk = -1;
                
-               for( int i=rl, bix=rl*n; i<ru; i++, bix+=n ) 
-               {
+               for( int i=rl; i<ru; i++ ) {
                        //compute block index and in-block indexes
                        int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-                       if( pos > 0 ) //selected row
-                       {
+                       if( pos > 0 ) { //selected row
                                int bpos = (pos-1) % brlen;
                                int blk = (pos-1) / brlen;
-                               
                                //allocate and switch to second output block
                                //(never happens in cp, correct for 
multi-threaded usage)
-                               if( lastblk!=-1 && lastblk<blk ){ 
+                               if( lastblk!=-1 && lastblk<blk ) {
                                        ret2.sparse = false;
                                        ret2.allocateDenseBlock();
-                                       c = ret2.getDenseBlockValues();
+                                       c = ret2.getDenseBlock();
                                }
-               
                                //memcopy entire dense row into target position
-                               System.arraycopy(b, bix, c, bpos*n, n);
+                               System.arraycopy(b.values(i), b.pos(i), 
c.values(bpos), c.pos(bpos), n);
                                lastblk = blk;
                        }
                }
@@ -1965,67 +2046,60 @@ public class LibMatrixMult
 
        private static void matrixMultPermuteDenseSparse( MatrixBlock pm1, 
MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
        {
-               double[] a = pm1.getDenseBlockValues();
-               double[] b = m2.getDenseBlockValues();
+               double[] a = pm1.getDenseBlockValues(); //vector
+               DenseBlock b = m2.getDenseBlock();
                SparseBlock c = ret1.sparseBlock;
 
                final int n = m2.clen;
                final int brlen = ret1.getNumRows();
                
                int lastblk = -1;
-               for( int i=rl, bix=rl*n; i<ru; i++, bix+=n ) 
-               {
+               for( int i=rl; i<ru; i++ ) {
                        //compute block index and in-block indexes
                        int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-                       if( pos > 0 ) //selected row
-                       {
+                       if( pos > 0 ) { //selected row
+                               double[] bvals = b.values(i);
+                               int bix = b.pos(i);
                                int bpos = (pos-1) % brlen;
                                int blk = (pos-1) / brlen;
-                               
                                //allocate and switch to second output block
                                //(never happens in cp, correct for 
multi-threaded usage)
                                if( lastblk!=-1 && lastblk<blk ){ 
                                        ret2.sparse = true;
                                        ret2.rlen=ret1.rlen;
                                        ret2.allocateSparseRowsBlock();
-                                       c = ret2.sparseBlock;           
+                                       c = ret2.sparseBlock;
                                }
-               
                                //append entire dense row into sparse target 
position
                                for( int j=0; j<n; j++ )
-                                       c.append(bpos, j, b[bix+j]);
+                                       c.append(bpos, j, bvals[bix+j]);
                                lastblk = blk;
                        }
                }
-               
        }
 
        private static void matrixMultPermuteSparse( MatrixBlock pm1, 
MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
        {
-               double[] a = pm1.getDenseBlockValues();
+               double[] a = pm1.getDenseBlockValues(); //vector
                SparseBlock b = m2.sparseBlock;
                SparseBlock c = ret1.sparseBlock;
 
                final int brlen = ret1.getNumRows();
                
                int lastblk = -1;
-               for( int i=rl; i<ru; i++ ) 
-               {
+               for( int i=rl; i<ru; i++ )  {
                        //compute block index and in-block indexes
                        int pos = UtilFunctions.toInt( a[ i ]); //safe cast
-                       if( pos > 0 ) //selected row
-                       {
+                       if( pos > 0 ) { //selected row
                                int bpos = (pos-1) % brlen;
                                int blk = (pos-1) / brlen;
-                               
                                //allocate and switch to second output block
                                //(never happens in cp, correct for 
multi-threaded usage)
                                if( lastblk!=-1 && lastblk<blk ){ 
                                        ret2.sparse = true;
                                        ret2.allocateSparseRowsBlock();
-                                       c = ret2.sparseBlock;           
+                                       c = ret2.sparseBlock;
                                }
-               
                                //memcopy entire sparse row into target position
                                c.set(bpos, b.get(i), true);
                                lastblk = blk;

Reply via email to