[SYSTEMML-2046] Large dense blocks in cache-conscious mm operators This patch finalizes the work on supporting large dense blocks (>16GB) in matrix-multiplication operators by handling all remaining cache-conscious implementations; detailed micro-benchmarks were required to ensure this generalization did not impact performance.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/0a9c91a6 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/0a9c91a6 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/0a9c91a6 Branch: refs/heads/master Commit: 0a9c91a63c4290f6edc3d8107e4338be4de679fb Parents: c9977b7 Author: Matthias Boehm <[email protected]> Authored: Wed Jan 10 17:12:31 2018 -0800 Committer: Matthias Boehm <[email protected]> Committed: Wed Jan 10 18:42:15 2018 -0800 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/DenseBlock.java | 11 + .../runtime/matrix/data/DenseBlockDRB.java | 5 + .../runtime/matrix/data/DenseBlockLDRB.java | 5 + .../runtime/matrix/data/LibMatrixMult.java | 338 ++++++++++--------- 4 files changed, 200 insertions(+), 159 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/0a9c91a6/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java index df6f2fb..50beb3c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlock.java @@ -105,6 +105,17 @@ public abstract class DenseBlock implements Serializable public abstract boolean isContiguous(); /** + * Indicates if the dense block has a single + * underlying block for the given row range. + * + * @param rl row lower index + * @param ru row upper index (inclusive) + * @return true if single block in row range + */ + public abstract boolean isContiguous(int rl, int ru); + + + /** * Get the length of the dense block as the product * of row and column dimensions. 
* http://git-wip-us.apache.org/repos/asf/systemml/blob/0a9c91a6/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java index 33dc492..7f2ddb0 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockDRB.java @@ -91,6 +91,11 @@ public class DenseBlockDRB extends DenseBlock public boolean isContiguous() { return true; } + + @Override + public boolean isContiguous(int rl, int ru) { + return true; + } @Override public long size() { http://git-wip-us.apache.org/repos/asf/systemml/blob/0a9c91a6/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java index 5f5ff79..e041f39 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/DenseBlockLDRB.java @@ -114,6 +114,11 @@ public class DenseBlockLDRB extends DenseBlock public boolean isContiguous() { return rlen <= blen; } + + @Override + public boolean isContiguous(int rl, int ru) { + return isContiguous() || index(rl)==index(ru); + } @Override public long size() { http://git-wip-us.apache.org/repos/asf/systemml/blob/0a9c91a6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 7159a7d..36d3dc2 100644 --- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -1049,34 +1049,44 @@ public class LibMatrixMult private static void matrixMultDenseDenseMMShortLHS(DenseBlock a, DenseBlock b, DenseBlock c, int m, int n, int cd, int rl, int ru) throws DMLRuntimeException { - //TODO robustness large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - //cache-conscious parallelization over rows in rhs matrix final int kn = (ru-rl)%4; //rest not aligned to blocks of 2 rows - for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n ) - for( int k=rl, bix=rl*n; k<rl+kn; k++, bix+=n ) + for( int i=0; i<m; i++ ) { + double[] avals = a.values(i), cvals = c.values(i); + int aix = a.pos(i), cix = c.pos(i); + for( int k=rl; k<rl+kn; k++ ) if( avals[aix+k] != 0 ) - vectMultiplyAdd(avals[aix+k], bvals, cvals, bix, cix, n); + vectMultiplyAdd(avals[aix+k], b.values(k), cvals, b.pos(k), cix, n); + } final int blocksizeK = 48; final int blocksizeJ = 1024; //blocked execution - for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) - for( int bj = 0, bkmin = Math.min(ru, bk+blocksizeK); bj < n; bj+=blocksizeJ ) { + for( int bk = rl+kn; bk < ru; bk+=blocksizeK ) { + int bkmin = Math.min(ru, bk+blocksizeK); + for( int bj = 0; bj < n; bj+=blocksizeJ ) { //compute blocks of 4 rows in rhs w/ IKJ int bjlen = Math.min(n, bj+blocksizeJ)-bj; - for( int i=0, aix=0, cix=bj; i<m; i++, aix+=cd, cix+=n ) - for( int k=bk, bix=bk*n+bj; k<bkmin; k+=4, bix+=4*n ) { - vectMultiplyAdd4(avals[aix+k], avals[aix+k+1], avals[aix+k+2], avals[aix+k+3], - bvals, cvals, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen); + for( int i=0; i<m; i++ ) { + double[] avals = a.values(i), cvals = c.values(i); + int aix = a.pos(i), cix = c.pos(i, bj); + if( b.isContiguous(bk, bkmin-1) ) { + double[] bvals = b.values(bk); + for( int k=bk, bix=b.pos(bk, bj); k<bkmin; k+=4, bix+=4*n ) 
+ vectMultiplyAdd4(avals[aix+k], avals[aix+k+1], avals[aix+k+2], avals[aix+k+3], + bvals, cvals, bix, bix+n, bix+2*n, bix+3*n, cix, bjlen); } + else { + for( int k=rl; k<rl+kn; k++ ) + if( avals[aix+k] != 0 ) + vectMultiplyAdd(avals[aix+k], b.values(k), cvals, b.pos(k), cix, n); + } + } } + } } private static void matrixMultDenseDenseMMSkinnyRHS(DenseBlock a, DenseBlock b, DenseBlock c, int n2, int cd, int rl, int ru) @@ -1095,11 +1105,6 @@ public class LibMatrixMult private static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) throws DMLRuntimeException { - //TODO robustness large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - //1) Unrolled inner loop (for better instruction-level parallelism) //2) Blocked execution (for less cache trashing in parallel exec) //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2) @@ -1121,27 +1126,37 @@ public class LibMatrixMult int bjlen = Math.min(cu, bj+blocksizeJ)-bj; //core sub block matrix multiplication - for( int i = bi; i < bimin; i++) - { - int aixi = i * cd + bk; //start index on a - int cixj = i * n + bj; //scan index on c - - //determine nnz of a (for sparsity-aware skipping of rows) - int knnz = copyNonZeroElements(avals, aixi, bk, bj, n, ta, tbi, bklen); - //if( knnz > 0 ) //for skipping empty rows - - //rest not aligned to blocks of 4 rows - final int bn = knnz % 4; - switch( bn ){ - case 1: vectMultiplyAdd(ta[0], bvals, cvals, tbi[0], cixj, bjlen); break; - case 2: vectMultiplyAdd2(ta[0],ta[1], bvals, cvals, tbi[0], tbi[1], cixj, bjlen); break; - case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], bvals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break; - } + for( int i = bi; i < bimin; i++) { + double[] avals = a.values(i), cvals = c.values(i); + int aixi = a.pos(i, bk), cixj = c.pos(i, bj); - //compute blocks of 4 rows (core inner loop) - for( 
int k = bn; k<knnz; k+=4 ){ - vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], bvals, cvals, + if( b.isContiguous(bk, bkmin-1) ) { + double[] bvals = b.values(bk); + int bkpos = b.pos(bk, bj); + + //determine nnz of a (for sparsity-aware skipping of rows) + int knnz = copyNonZeroElements(avals, aixi, bkpos, bj, n, ta, tbi, bklen); + + //rest not aligned to blocks of 4 rows + final int bn = knnz % 4; + switch( bn ){ + case 1: vectMultiplyAdd(ta[0], bvals, cvals, tbi[0], cixj, bjlen); break; + case 2: vectMultiplyAdd2(ta[0],ta[1], bvals, cvals, tbi[0], tbi[1], cixj, bjlen); break; + case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], bvals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break; + } + + //compute blocks of 4 rows (core inner loop) + for( int k = bn; k<knnz; k+=4 ){ + vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], bvals, cvals, tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen ); + } + } + else { + for( int k = bk; k<bkmin; k++ ) { + if( avals[k] != 0 ) + vectMultiplyAdd( avals[k], b.values(k), + cvals, b.pos(k, bj), cixj, bjlen ); + } } } } @@ -1339,35 +1354,41 @@ public class LibMatrixMult private static void matrixMultSparseDenseMMShortLHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, int rl, int ru) throws DMLRuntimeException { - //TODO robustness large blocks (perf critical) - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - int arlen = a.numRows(); - for( int i=0, cix=0; i<arlen; i++, cix+=n ) { + for( int i=0; i<arlen; i++ ) { if( a.isEmpty(i) ) continue; int apos = a.pos(i); int alen = a.size(i); int[] aix = a.indexes(i); double[] avals = a.values(i); + double[] cvals = c.values(i); + int cix = c.pos(i); int k1 = (rl==0) ? 0 : a.posFIndexGTE(i, rl); k1 = (k1>=0) ? apos+k1 : apos+alen; int k2 = (ru==cd) ? alen : a.posFIndexGTE(i, ru); k2 = (k2>=0) ? 
apos+k2 : apos+alen; - //rest not aligned to blocks of 4 rows - final int bn = (k2-k1) % 4; - switch( bn ){ - case 1: vectMultiplyAdd(avals[k1], bvals, cvals, aix[k1]*n, cix, n); break; - case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], bvals, cvals, aix[k1]*n, aix[k1+1]*n, cix, n); break; - case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], bvals, cvals, aix[k1]*n, aix[k1+1]*n, aix[k1+2]*n, cix, n); break; + if( b.isContiguous(aix[k1], aix[k2-1]) ) { + double[] bvals = b.values(aix[k1]); + int base = aix[k1]*n - b.pos(aix[k1]); + //rest not aligned to blocks of 4 rows + final int bn = (k2-k1) % 4; + switch( bn ){ + case 1: vectMultiplyAdd(avals[k1], bvals, cvals, aix[k1]*n-base, cix, n); break; + case 2: vectMultiplyAdd2(avals[k1],avals[k1+1], bvals, cvals, aix[k1]*n-base, aix[k1+1]*n-base, cix, n); break; + case 3: vectMultiplyAdd3(avals[k1],avals[k1+1],avals[k1+2], bvals, cvals, aix[k1]*n-base, aix[k1+1]*n-base, aix[k1+2]*n-base, cix, n); break; + } + + //compute blocks of 4 rows (core inner loop) + for( int k = k1+bn; k<k2; k+=4 ) { + vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, + aix[k]*n-base, aix[k+1]*n-base, aix[k+2]*n-base, aix[k+3]*n-base, cix, n ); + } } - - //compute blocks of 4 rows (core inner loop) - for( int k = k1+bn; k<k2; k+=4 ) { - vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, - aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n ); + else { + for( int k = k1; k<k2; k++ ) + vectMultiplyAdd( avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), cix, n ); } } } @@ -1375,10 +1396,6 @@ public class LibMatrixMult private static void matrixMultSparseDenseMMSkinnyRHS(SparseBlock a, DenseBlock b, DenseBlock c, int n, int rl, int ru) throws DMLRuntimeException { - //TODO robustness large blocks (perf critical) - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - //no blocking since b and c fit into cache anyway for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) { if( 
a.isEmpty(i) ) continue; @@ -1386,11 +1403,13 @@ public class LibMatrixMult int alen = a.size(i); int[] aix = a.indexes(i); double[] avals = a.values(i); + double[] cvals = c.values(i); //rest not aligned to blocks of 4 rows - int bn = alen%4; + int bn = b.isContiguous() ? alen%4 : alen; for( int k=apos; k<apos+bn; k++ ) - vectMultiplyAdd(avals[k], bvals, cvals, aix[k]*n, cix, n); + vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k]), cix, n); //compute blocks of 4 rows (core inner loop) + double[] bvals = b.valuesAt(0); //only for contiguous for( int k=apos+bn; k<apos+alen; k+=4 ) vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n ); @@ -1400,10 +1419,6 @@ public class LibMatrixMult private static void matrixMultSparseDenseMM(SparseBlock a, DenseBlock b, DenseBlock c, int n, int cd, long xsp, int rl, int ru) throws DMLRuntimeException { - //TODO robustness large blocks (perf critical) - double[] bvals = b.valuesAt(0); - double[] cvals = c.valuesAt(0); - //blocksizes to fit blocks of B (dense) and several rows of A/C in common L2 cache size, //while blocking A/C for L1/L2 yet allowing long scans (2 pages) in the inner loop over j //in case of almost ultra-sparse matrices, we cannot ensure the blocking for the rhs and @@ -1423,26 +1438,28 @@ public class LibMatrixMult int bjlen = Math.min(n, bj+blocksizeJ)-bj; //core sub block matrix multiplication - for( int i=bi, cix=bi*n+bj; i<bimin; i++, cix+=n ) { - if( !a.isEmpty(i) ) { - int apos = a.pos(i); - int alen = a.size(i); - int[] aix = a.indexes(i); - double[] avals = a.values(i); - - int k = curk[i-bi] + apos; - //rest not aligned to blocks of 4 rows - int bn = alen%4; - for( ; k<apos+bn && aix[k]<bkmin; k++ ) - vectMultiplyAdd(avals[k], bvals, cvals, aix[k]*n+bj, cix, bjlen); - //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin - for( ; k<apos+alen && aix[k]<bkmin; k+=4 ) - vectMultiplyAdd4( avals[k], 
avals[k+1], avals[k+2], avals[k+3], bvals, cvals, - aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen ); - //update positions on last bj block - if( bj+bjlen==n ) - curk[i-bi] = k - apos; - } + for( int i=bi; i<bimin; i++ ) { + if( a.isEmpty(i) ) continue; + int apos = a.pos(i); + int alen = a.size(i); + int[] aix = a.indexes(i); + double[] avals = a.values(i); + double[] cvals = c.values(i); + int cix = c.pos(i, bj); + + int k = curk[i-bi] + apos; + //rest not aligned to blocks of 4 rows + int bn = b.isContiguous() ? alen%4 : alen; + for( ; k<apos+bn && aix[k]<bkmin; k++ ) + vectMultiplyAdd(avals[k], b.values(aix[k]), cvals, b.pos(aix[k],bj), cix, bjlen); + //compute blocks of 4 rows (core inner loop), allowed to exceed bkmin + double[] bvals = b.valuesAt(0); //only for contiguous + for( ; k<apos+alen && aix[k]<bkmin; k+=4 ) + vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], bvals, cvals, + aix[k]*n+bj, aix[k+1]*n+bj, aix[k+2]*n+bj, aix[k+3]*n+bj, cix, bjlen ); + //update positions on last bj block + if( bj+bjlen==n ) + curk[i-bi] = k - apos; } } } @@ -1765,17 +1782,13 @@ public class LibMatrixMult } else //MATRIX { - //TODO robustness large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] cvals = c.valuesAt(0); - //1) Unrolled inner loop (for better instruction-level parallelism) - //2) Blocked execution (for less cache trashing in parallel exec) + //2) Blocked execution (for less cache trashing in parallel exec) //3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2) - final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block + final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c - final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan + final int blocksizeJ = 1024; 
//512//4KB (typical main-memory page size), for scan //temporary arrays (nnz a, b index) double[] ta = new double[ blocksizeK ]; @@ -1796,43 +1809,57 @@ public class LibMatrixMult //core sub block matrix multiplication for( int i = bi; i < bimin; i++) { - int aixi = bk*n +i; //start index on a (logical t(X)) - int cixj = i * nx + bj; //scan index on c + double[] cvals = c.values(i); + int cixj = c.pos(i, bj); - //determine nnz of a (for sparsity-aware skipping of rows) - int knnz = copyNonZeroElements(avals, aixi, bk, bj, n, nx, ta, tbi, bklen); - - //rest not aligned to blocks of 4 rows - final int bn = knnz % 4; - switch( bn ){ - case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); break; - case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], cixj, bjlen); break; - case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break; + if( a.isContiguous(bk, bkmin-1) ) { + double[] avals = a.values(bk); + int aixi = a.pos(bk, i); + + //determine nnz of a (for sparsity-aware skipping of rows) + int knnz = copyNonZeroElements(avals, aixi, bk, bj, n, nx, ta, tbi, bklen); + + //rest not aligned to blocks of 4 rows + final int bn = knnz % 4; + switch( bn ){ + case 1: vectMultiplyAdd(ta[0], avals, cvals, tbi[0], cixj, bjlen); break; + case 2: vectMultiplyAdd2(ta[0],ta[1], avals, cvals, tbi[0], tbi[1], cixj, bjlen); break; + case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], avals, cvals, tbi[0], tbi[1],tbi[2], cixj, bjlen); break; + } + + //compute blocks of 4 rows (core inner loop) + for( int k = bn; k<knnz; k+=4 ){ + vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals, + tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen ); + } } - - //compute blocks of 4 rows (core inner loop) - for( int k = bn; k<knnz; k+=4 ){ - vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], avals, cvals, - tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen ); + else { + for( int k = bk; k<bkmin; k++ ) { + double[] avals = 
a.values(bk); + int aix = a.pos(bk, i); + if( avals[aix] != 0 ) + vectMultiplyAdd( avals[aix], a.values(k), + cvals, a.pos(k, bj), cixj, bjlen ); + } } } } } } - else - { - //TODO robustness large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] cvals = c.valuesAt(0); - - for(int k = 0, ix1 = 0; k < m; k++, ix1+=n) - for(int i = rl, ix3 = 0; i < ru; i++, ix3+=n) { - double val = avals[ ix1+i ]; + else { + for( int k = 0; k < m; k++ ) { + double[] avals = a.values(k); + int aix = a.pos(k); + for( int i = rl; i < ru; i++ ) { + double[] cvals = c.values(i); + int cix = c.pos(i); + double val = avals[ aix+i ]; if( val != 0 ) { for(int j = i; j < n; j++) //from i due to symmetry - cvals[ ix3+j ] += val * avals[ ix1+j ]; + cvals[ cix+j ] += val * avals[ aix+j ]; } } + } } } else // X%*%t(X) @@ -1846,10 +1873,6 @@ public class LibMatrixMult } else //MATRIX { - //TODO robustness large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] cvals = c.valuesAt(0); - //algorithm: scan c, foreach ci,j: scan row of a and t(a) (IJK) //1) Unrolled inner loop, for better ILP @@ -1864,29 +1887,31 @@ public class LibMatrixMult for( int bk = 0, bimin = Math.min(ru, bi+blocksizeIJ); bk<n; bk+=blocksizeK ) for( int bj = bi, bklen = Math.min(blocksizeK, n-bk); bj<m; bj+=blocksizeIJ ) { //core tsmm block operation (15x15 vectors of length 1K elements) - int bjmin = Math.min(m, bj+blocksizeIJ); - for(int i=bi, ix1=bi*n+bk, ix3=bi*m; i<bimin; i++, ix1+=n, ix3+=m) { + int bjmin = Math.min(m, bj+blocksizeIJ); + for( int i=bi; i<bimin; i++ ) { final int bjmax = Math.max(i,bj); //from i due to symmetry - for(int j=bjmax, ix2=bjmax*n+bk; j <bjmin; j++, ix2+=n) - cvals[ ix3+j ] += dotProduct(avals, avals, ix1, ix2, bklen); + double[] avals = a.values(i), cvals = c.values(i); + int aix = a.pos(i, bk), cix = c.pos(i); + for(int j=bjmax; j <bjmin; j++) + cvals[ cix+j ] += dotProduct(avals, a.values(j), aix, a.pos(j, bk), bklen); } } } } else { - //TODO robustness 
large blocks (perf critical) - double[] avals = a.valuesAt(0); - double[] cvals = c.valuesAt(0); - - for(int i = rl, ix1 = 0, ix3 = 0; i < ru; i++, ix1+=n, ix3+=m) - for(int j = i, ix2 = i*n; j < m; j++, ix2+=n) //from i due to symmetry - { + for( int i = rl; i < ru; i++ ) { + double[] avals1 = a.values(i), cvals = c.values(i); + int aix1 = a.pos(i), cix = c.pos(i); + for(int j = i; j < m; j++) { //from i due to symmetry + double[] avals2 = a.values(j); + int aix2 = a.pos(j); double val = 0; for(int k = 0; k < n; k++) - val += avals[ ix1+k ] * avals[ix2+k]; - cvals[ ix3+j ] = val; + val += avals1[aix1+k] * avals2[aix2+k]; + cvals[cix+j] = val; } + } } } } @@ -1899,13 +1924,9 @@ public class LibMatrixMult SparseBlock a = m1.sparseBlock; DenseBlock c = ret.getDenseBlock(); int m = m1.rlen; - int n = m1.clen; if( leftTranspose ) // t(X)%*%X { - //TODO robustness large blocks (perf critical) - double[] cvals = c.valuesAt(0); - //only general case (because vectors always dense) //algorithm: scan rows, foreach row self join (KIJ) if( LOW_LEVEL_OPTIMIZATION ) @@ -1922,10 +1943,9 @@ public class LibMatrixMult int len = apos + alen; for(int i = rlix; i < len && aix[i]<ru; i++) { double val = avals[i]; - if( val != 0 ) { - int ix2 = aix[i]*n; - vectMultiplyAdd(val, avals, cvals, aix, i, ix2, len-i); - } + if( val != 0 ) + vectMultiplyAdd(val, avals, c.values(aix[i]), + aix, i, c.pos(aix[i]), len-i); } } } @@ -1941,9 +1961,12 @@ public class LibMatrixMult rlix = (rlix>=0) ? 
apos+rlix : apos+alen; for(int i = rlix; i < apos+alen && aix[i]<ru; i++) { double val = avals[i]; - if( val != 0 ) - for(int j = i, ix2 = aix[i]*n; j < apos+alen; j++) - cvals[ix2+aix[j]] += val * avals[j]; + if( val != 0 ) { + double[] cvals = c.values(aix[i]); + int cix = c.pos(aix[i]); + for(int j = i; j < apos+alen; j++) + cvals[cix+aix[j]] += val * avals[j]; + } } } } @@ -1960,14 +1983,10 @@ public class LibMatrixMult } else //MATRIX { - //TODO robustness large blocks (perf critical) - double[] cvals = c.valuesAt(0); - //note: reorg to similar layout as t(X)%*%X because faster than //direct computation with IJK (no dependencies/branches in inner loop) //see preprocessMatrixMultTransposeSelf m1<-tmpBlock m = m1.clen; - n = m1.rlen; //algorithm: scan rows, foreach row self join (KIJ) if( LOW_LEVEL_OPTIMIZATION ) @@ -1981,13 +2000,11 @@ public class LibMatrixMult double[] avals = a.values(r); int rlix = (rl==0) ? 0 : a.posFIndexGTE(r, rl); rlix = (rlix>=0) ? apos+rlix : apos+alen; - for(int i = rlix; i < apos+alen && aix[i]<ru; i++) { double val = avals[i]; - if( val != 0 ) { - int ix2 = aix[i]*m; - vectMultiplyAdd(val, avals, cvals, aix, i, ix2, alen-i); - } + if( val != 0 ) + vectMultiplyAdd(val, avals, c.values(aix[i]), + aix, i, c.pos(aix[i]), alen-i); } } } @@ -2003,9 +2020,12 @@ public class LibMatrixMult rlix = (rlix>=0) ? 
apos+rlix : apos+alen; for(int i = rlix; i < apos+alen && aix[i]<ru; i++) { double val = avals[i]; - if( val != 0 ) - for(int j = i, ix2 = aix[i]*m; j < alen; j++) - cvals[ix2+aix[j]] += val * avals[j]; + if( val != 0 ) { + double[] cvals = c.values(aix[i]); + int cix = c.pos(aix[i]); + for( int j = i; j < alen; j++ ) + cvals[cix+aix[j]] += val * avals[j]; + } } } } @@ -3752,13 +3772,13 @@ public class LibMatrixMult return ret; } - private static int copyNonZeroElements( double[] a, final int aixi, final int bk, final int bj, final int n, double[] tmpa, int[] tmpbi, final int bklen ) + private static int copyNonZeroElements( double[] a, final int aixi, final int bixk, final int bj, final int n, double[] tmpa, int[] tmpbi, final int bklen ) { int knnz = 0; for( int k = 0; k < bklen; k++ ) if( a[ aixi+k ] != 0 ) { tmpa[ knnz ] = a[ aixi+k ]; - tmpbi[ knnz ] = (bk+k) * n + bj; //scan index on b + tmpbi[ knnz ] = bixk + k*n; knnz ++; }
