[SYSTEMML-2020] Avoid sparse-dense conversion in codegen outer tpls This patch generalizes the codegen outer product template to consume - similar to the cell and row templates - generic SideInputs instead of dense matrices as side input, which avoids unnecessary conversions from sparse to dense matrices and thus improves performance.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/db3d54c1 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/db3d54c1 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/db3d54c1 Branch: refs/heads/master Commit: db3d54c1ce6651f2abf43d3ba919527b943aa8ba Parents: 9391728 Author: Matthias Boehm <[email protected]> Authored: Fri Nov 17 16:49:45 2017 -0800 Committer: Matthias Boehm <[email protected]> Committed: Fri Nov 17 16:49:45 2017 -0800 ---------------------------------------------------------------------- .../hops/codegen/cplan/CNodeOuterProduct.java | 5 +- .../runtime/codegen/SpoofOuterProduct.java | 66 +++++++++++--------- 2 files changed, 39 insertions(+), 32 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/db3d54c1/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java index e518246..e2967a5 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java +++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java @@ -31,6 +31,7 @@ public class CNodeOuterProduct extends CNodeTpl private static final String TEMPLATE = "package codegen;\n" + "import org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n" + + "import org.apache.sysml.runtime.codegen.SpoofOperator.SideInput;\n" + "import org.apache.sysml.runtime.codegen.SpoofOuterProduct;\n" + "import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;\n" + "import org.apache.commons.math3.util.FastMath;\n" @@ -39,10 +40,10 @@ public class CNodeOuterProduct extends CNodeTpl + " public %TMP%() {\n" + " super(OutProdType.%TYPE%);\n" + " }\n" - + " protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, double[] c, int ci, int m, int n, int len, int rix, int cix) { \n" + + " protected void genexecDense(double a, double[] a1, int a1i, double[] a2, int a2i, SideInput[] b, double[] scalars, double[] c, int ci, int m, int n, int len, int rix, int cix) { \n" + "%BODY_dense%" + " }\n" - + " protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, double[][] b, double[] scalars, int m, int n, int len, int rix, int cix) { \n" + + " protected double genexecCellwise(double a, double[] a1, int a1i, double[] a2, int a2i, SideInput[] b, double[] scalars, int m, int n, int len, int rix, int cix) { \n" + "%BODY_cellwise%" + " return %OUT_cellwise%;\n" + " }\n" http://git-wip-us.apache.org/repos/asf/systemml/blob/db3d54c1/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java index 26a661a..4d4da51 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java @@ -82,7 +82,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator //input preparation double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); - double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true)); + SideInput[] b = prepInputMatrices(inputs, 3, false); double[] scalars = prepInputScalars(scalarObjects); //core sequential execute @@ -118,7 +118,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator //input preparation double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); - double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true)); + SideInput[] b = prepInputMatrices(inputs, 3, false); double[] scalars = prepInputScalars(scalarObjects); //core sequential execute @@ -186,9 +186,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator //input preparation double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); - double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true)); + SideInput[] b = prepInputMatrices(inputs, 3, false); double[] scalars = prepInputScalars(scalarObjects); - + //core sequential execute final int m = inputs.get(0).getNumRows(); final int n = inputs.get(0).getNumColumns(); @@ -268,7 +268,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator //input preparation double[][] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); - double[][] b = getDenseMatrices(prepInputMatrices(inputs, 3, true)); + SideInput[] b = prepInputMatrices(inputs, 3, false); double[] scalars = prepInputScalars(scalarObjects); //core sequential execute @@ -338,13 +338,14 @@ public abstract class SpoofOuterProduct extends SpoofOperator return UtilFunctions.roundToNext(base, k); } - private void executeDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars, + private void executeDense(double[] a, double[] u, double[] v, SideInput[] b, double[] scalars, double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { //approach: iterate over non-zeros of w, selective mm computation //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) + SideInput[] lb = createSparseSideInputs(b); final int blocksizeIJ = 16; //u/v block (max at typical L2 size) int cix = 0; //blocked execution @@ -358,18 +359,19 @@ public abstract class SpoofOuterProduct extends SpoofOperator for( int j=bj, vix=bj*k; j<bjmin; j++, vix+=k) if( a[ix+j] != 0 ) { cix = (type == OutProdType.LEFT_OUTER_PRODUCT) ? vix : uix; - genexecDense( a[ix+j], u, uix, v, vix, b, scalars, c, cix, m, n, k, i, j); + genexecDense( a[ix+j], u, uix, v, vix, lb, scalars, c, cix, m, n, k, i, j); } } } - private void executeCellwiseDense(double[] a, double[] u, double[] v, double[][] b, double[] scalars, + private void executeCellwiseDense(double[] a, double[] u, double[] v, SideInput[] b, double[] scalars, double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { //approach: iterate over non-zeros of w, selective mm computation //cache-conscious blocking: due to blocksize constraint (default 1000), //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) + SideInput[] lb = createSparseSideInputs(b); final int blocksizeIJ = 16; //u/v block (max at typical L2 size) //blocked execution double sum = 0; @@ -383,18 +385,19 @@ public abstract class SpoofOuterProduct extends SpoofOperator for( int j=bj, vix=bj*k; j<bjmin; j++, vix+=k) if( a[ix+j] != 0 ) { if(type == OutProdType.CELLWISE_OUTER_PRODUCT) - c[ix+j] = genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, m, n, k, i, j ); + c[ix+j] = genexecCellwise( a[ix+j], u, uix, v, vix, lb, scalars, m, n, k, i, j ); else - sum += genexecCellwise( a[ix+j], u, uix, v, vix, b, scalars, m, n, k, i, j); + sum += genexecCellwise( a[ix+j], u, uix, v, vix, lb, scalars, m, n, k, i, j); } } if( type != OutProdType.CELLWISE_OUTER_PRODUCT ) c[0] = sum; } - private void executeSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars, + private void executeSparse(SparseBlock sblock, double[] u, double[] v, SideInput[] b, double[] scalars, double[] c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) { + SideInput[] lb = createSparseSideInputs(b); boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT); //approach: iterate over non-zeros of w, selective mm computation @@ -420,7 +423,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl); index = wpos + ((index>=0) ? index : n); for( ; index<wpos+wlen && wix[index]<cu; index++ ) { - genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, + genexecDense(wval[index], u, uix, v, wix[index]*k, lb, scalars, c, (left ? wix[index]*k : uix), m, n, k, i, wix[index]); } } @@ -454,7 +457,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator int index = wpos + curk[i-bi]; for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { - genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, + genexecDense(wval[index], u, uix, v, wix[index]*k, lb, scalars, c, (left ? wix[index]*k : uix), m, n, k, i, wix[index]); } curk[i-bi] = index - wpos; @@ -464,9 +467,10 @@ public abstract class SpoofOuterProduct extends SpoofOperator } } - private void executeCellwiseSparse(SparseBlock sblock, double[] u, double[] v, double[][] b, double[] scalars, + private void executeCellwiseSparse(SparseBlock sblock, double[] u, double[] v, SideInput[] b, double[] scalars, MatrixBlock out, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) { + SideInput[] lb = createSparseSideInputs(b); final int blocksizeIJ = (int) (8L*m*n/nnz); int[] curk = new int[Math.min(blocksizeIJ, ru-rl)]; @@ -491,11 +495,11 @@ public abstract class SpoofOuterProduct extends SpoofOperator if( type == OutProdType.CELLWISE_OUTER_PRODUCT ) for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) c[wix[index]] = genexecCellwise( wval[index], - u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index] ); + u, uix, v, wix[index]*k, lb, scalars, m, n, k, i, wix[index] ); else for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) tmp += genexecCellwise( wval[index], - u, uix, v, wix[index]*k, b, scalars, m, n, k, i, wix[index]); + u, uix, v, wix[index]*k, lb, scalars, m, n, k, i, wix[index]); curk[i-bi] = index - wpos; } } @@ -522,7 +526,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator int index = wpos + curk[i-bi]; for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { c.append(i, wix[index], genexecCellwise( wval[index], u, uix, v, - wix[index]*k, b, scalars, m, n, k, i, wix[index] )); + wix[index]*k, lb, scalars, m, n, k, i, wix[index] )); } curk[i-bi] = index - wpos; } @@ -531,9 +535,10 @@ public abstract class SpoofOuterProduct extends SpoofOperator } } - private void executeCompressed(CompressedMatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, + private void executeCompressed(CompressedMatrixBlock a, double[] u, double[] v, SideInput[] b, double[] scalars, double[] c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu) { + SideInput[] lb = createSparseSideInputs(b); boolean left = (_outerProductType==OutProdType.LEFT_OUTER_PRODUCT); Iterator<IJV> iter = !left ? a.getIterator(rl, ru, false) : @@ -542,14 +547,15 @@ public abstract class SpoofOuterProduct extends SpoofOperator IJV cell = iter.next(); int uix = cell.getI() * k; int vix = cell.getJ() * k; - genexecDense(cell.getV(), u, uix, v, vix, b, scalars, c, + genexecDense(cell.getV(), u, uix, v, vix, lb, scalars, c, left ? vix : uix, m, n, k, cell.getI(), cell.getJ()); } } - private void executeCellwiseCompressed(CompressedMatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, + private void executeCellwiseCompressed(CompressedMatrixBlock a, double[] u, double[] v, SideInput[] b, double[] scalars, MatrixBlock out, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) - { + { + SideInput[] lb = createSparseSideInputs(b); double[] c = out.getDenseBlock(); SparseBlock csblock = out.getSparseBlock(); @@ -562,23 +568,23 @@ public abstract class SpoofOuterProduct extends SpoofOperator if( out.isInSparseFormat() ) { csblock.allocate(cell.getI()); csblock.append(cell.getI(), cell.getJ(), - genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ())); + genexecCellwise(cell.getV(), u, uix, v, vix, lb, scalars, m, n, k, cell.getI(), cell.getJ())); } else { c[cell.getI()*n+cell.getJ()] = - genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ()); + genexecCellwise(cell.getV(), u, uix, v, vix, lb, scalars, m, n, k, cell.getI(), cell.getJ()); } } else { - c[0] += genexecCellwise(cell.getV(), u, uix, v, vix, b, scalars, m, n, k, cell.getI(), cell.getJ()); + c[0] += genexecCellwise(cell.getV(), u, uix, v, vix, lb, scalars, m, n, k, cell.getI(), cell.getJ()); } } } - protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, double[][] b, + protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, SideInput[] b, double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex); - protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, double[][] b, + protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, SideInput[] b, double[] scalars, int m, int n, int k, int rowIndex, int colIndex); private class ParExecTask implements Callable<Long> @@ -586,7 +592,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator private final MatrixBlock _a; private final double[] _u; private final double[] _v; - private final double[][] _b; + private final SideInput[] _b; private final double[] _scalars; private final MatrixBlock _c; private final int _clen; @@ -598,7 +604,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator private final int _cl; private final int _cu; - protected ParExecTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars , MatrixBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { + protected ParExecTask( MatrixBlock a, double[] u, double[] v, SideInput[] b, double[] scalars , MatrixBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { _a = a; _u = u; _v = v; @@ -653,7 +659,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator private final MatrixBlock _a; private final double[] _u; private final double[] _v; - private final double[][] _b; + private final SideInput[] _b; private final double[] _scalars; private final int _rlen; private final int _clen; @@ -664,7 +670,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator private final int _cl; private final int _cu; - protected ParOuterProdAggTask( MatrixBlock a, double[] u, double[] v, double[][] b, double[] scalars, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { + protected ParOuterProdAggTask( MatrixBlock a, double[] u, double[] v, SideInput[] b, double[] scalars, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { _a = a; _u = u; _v = v;
