[SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs This patch improves the performance of codegen outer operations (especially left mm), for ultra-sparse inputs. On ultra-sparse inputs, the allocation and maintenance of column indexes can become a bottleneck. Accordingly, this patch adds a special case for ultra-sparse matrices.
For the core update rules of ALS-CG, this patch improved the performance as follows (10 iterations over the amazon review dataset): Left (t(t(U) %*% (W * (U %*% t(V))))): 125.7s -> 14.2s Right ((W * (U %*% t(V))) %*% V): 25.2s -> 17.9s Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ede870de Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ede870de Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ede870de Branch: refs/heads/master Commit: ede870de8a7b01bd44ac3b6bcfe7f0e86b1c93c8 Parents: 83e01b0 Author: Matthias Boehm <[email protected]> Authored: Fri Oct 27 22:01:36 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Fri Oct 27 22:01:36 2017 -0700 ---------------------------------------------------------------------- .../runtime/codegen/SpoofOuterProduct.java | 77 +++++++++++++------- 1 file changed, 52 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/ede870de/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java index ac8dc57..26a661a 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java @@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.compress.CompressedMatrixBlock; import org.apache.sysml.runtime.instructions.cp.DoubleObject; @@ -391,7 +392,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator c[0] = sum; } - private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars, + private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars, double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) { boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT); @@ -401,37 +402,63 @@ public abstract class SpoofOuterProduct extends SpoofOperator //blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times, //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output. final int blocksizeI = (int) (8L*m*n/nnz); - final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI; - int[] curk = new int[Math.min(blocksizeI,ru-rl)]; - for( int bi = rl; bi < ru; bi+=blocksizeI ) + if( OptimizerUtils.getSparsity(m, n, nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse { - int bimin = Math.min(ru, bi+blocksizeI); - //prepare starting indexes for block row - for( int i=bi; i<bimin; i++ ) { + //for ultra-sparse matrices, we do not allocate the index array because + //its allocation and maintenance can dominate the total runtime. + + //core wdivmm block matrix mult + for( int i=rl, uix=rl*k; i<ru; i++, uix+=k ) { + if( sblock.isEmpty(i) ) continue; + + int wpos = sblock.pos(i); + int wlen = sblock.size(i); + int[] wix = sblock.indexes(i); + double[] wval = sblock.values(i); + int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl); - curk[i-bi] = (index>=0) ? index : n; + index = wpos + ((index>=0) ? index : n); + for( ; index<wpos+wlen && wix[index]<cu; index++ ) { + genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, + (left ? wix[index]*k : uix), m, n, k, i, wix[index]); + } } + } + else //sparse + { + final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI; + int[] curk = new int[Math.min(blocksizeI,ru-rl)]; - //blocked execution over column blocks - for( int bj = cl; bj < cu; bj+=blocksizeJ ) + for( int bi = rl; bi < ru; bi+=blocksizeI ) { - int bjmin = Math.min(cu, bj+blocksizeJ); - //core wdivmm block matrix mult - for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) { - if( sblock.isEmpty(i) ) continue; - - int wpos = sblock.pos(i); - int wlen = sblock.size(i); - int[] wix = sblock.indexes(i); - double[] wval = sblock.values(i); - - int index = wpos + curk[i-bi]; - for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { - genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, - (left ? wix[index]*k : uix), m, n, k, i, wix[index]); + int bimin = Math.min(ru, bi+blocksizeI); + //prepare starting indexes for block row + for( int i=bi; i<bimin; i++ ) { + int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl); + curk[i-bi] = (index>=0) ? index : n; + } + + //blocked execution over column blocks + for( int bj = cl; bj < cu; bj+=blocksizeJ ) + { + int bjmin = Math.min(cu, bj+blocksizeJ); + //core wdivmm block matrix mult + for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) { + if( sblock.isEmpty(i) ) continue; + + int wpos = sblock.pos(i); + int wlen = sblock.size(i); + int[] wix = sblock.indexes(i); + double[] wval = sblock.values(i); + + int index = wpos + curk[i-bi]; + for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { + genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, + (left ? wix[index]*k : uix), m, n, k, i, wix[index]); + } + curk[i-bi] = index - wpos; } - curk[i-bi] = index - wpos; } } }
