[SYSTEMML-2351] Performance sparse matrix-vector mult w/ large vector This patch improves the performance of sparse matrix-vector multiplication with large vectors (> L3 cache size) as used for algorithms such as PageRank and local singlenode operations. Although we already had a cache-conscious implementation, this case required additional tuning. In detail, this includes better blocking for L1 (pos and output vector), and L2 (input vector) which allows a better exploitation of cache locality of the vector across rows of the large, sparse input matrix.
For executing 100 iterations of PageRank over a graph with 10M nodes and 5G edges (~60GB, 80MB vector) on a 2-socket box w/ Xeon Gold 6138 (80 vcores), this patch improved the end-to-end runtime from 288s to 164s. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/af9cc8a9 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/af9cc8a9 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/af9cc8a9 Branch: refs/heads/master Commit: af9cc8a90b73520b04ecd579d7359a62d577009f Parents: 7eebb42 Author: Matthias Boehm <[email protected]> Authored: Wed May 30 22:05:18 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Wed May 30 22:05:18 2018 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixMult.java | 23 +++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/af9cc8a9/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 92a6b7a..31f827b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -1237,14 +1237,14 @@ public class LibMatrixMult double[] bvals = b.valuesAt(0); double[] cvals = c.valuesAt(0); - final int blocksizeI = 32; - final int blocksizeK = (int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1 + final int blocksizeI = 512; //8KB curk+cvals in L1 + final int blocksizeK = (int)Math.max(2048,2048*xsp/32); //~256KB bvals in L2 int[] curk = new int[blocksizeI]; for( int bi = rl; bi < ru; bi+=blocksizeI ) { Arrays.fill(curk, 0); //reset positions for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk<cd; bk+=blocksizeK ) { - for( int i=bi, bkmin = Math.min(bk+blocksizeK, cd); i<bimin; i++) { + for( int i=bi, bkmin = bk+blocksizeK; i<bimin; i++) { if( a.isEmpty(i) ) continue; int apos = a.pos(i); int alen = a.size(i); @@ -3814,6 +3814,23 @@ public class LibMatrixMult return knnz; } + + @SuppressWarnings("unused") + private static void resetPosVect(int[] curk, SparseBlock sblock, int rl, int ru) { + if( sblock instanceof SparseBlockMCSR ) { + //all rows start at position 0 (individual arrays) + Arrays.fill(curk, 0, ru-rl, 0); + } + else if( sblock instanceof SparseBlockCSR ) { + //row start positions given in row ptr array + SparseBlockCSR csr = (SparseBlockCSR) sblock; + System.arraycopy(csr.rowPointers(), rl, curk, 0, ru-rl); + } + else { //general case + for(int i=rl; i<ru; i++) + curk[i-rl] = sblock.pos(i); + } + } private static void sumScalarResults(List<Future<Double>> tasks, MatrixBlock ret) throws InterruptedException, ExecutionException
