[SYSTEMML-2267] Generalized multi-threaded unary ops for dense blocks >16GB. This patch generalizes the newly introduced multi-threaded unary operations to large dense blocks (>16GB) by processing one physical block at a time via Arrays.parallelSetAll.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2f278bc2 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2f278bc2 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2f278bc2 Branch: refs/heads/master Commit: 2f278bc2ac85d391b9353124ce85b7db884cba5b Parents: c1a7f85 Author: Matthias Boehm <[email protected]> Authored: Fri Apr 20 19:44:37 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Fri Apr 20 19:44:37 2018 -0700 ---------------------------------------------------------------------- .../apache/sysml/runtime/matrix/data/MatrixBlock.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/2f278bc2/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index bb5e79b..9e032b6 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -2584,15 +2584,18 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab else LibMatrixAgg.cumaggregateUnaryMatrix(this, ret, op); } - else if(!sparse && !isEmptyBlock(false) && getDenseBlock().isContiguous() + else if(!sparse && !isEmptyBlock(false) && OptimizerUtils.isMaxLocalParallelism(op.getNumThreads())) { //note: we apply multi-threading in a best-effort manner here //only for expensive operators such as exp, log, sigmoid, because //otherwise allocation, read and write anyway dominates ret.allocateDenseBlock(false); - double[] a = getDenseBlockValues(); - double[] c = ret.getDenseBlockValues(); - Arrays.parallelSetAll(c, i -> 
op.fn.execute(a[i])); + DenseBlock a = getDenseBlock(); + DenseBlock c = ret.getDenseBlock(); + for(int bi=0; bi<a.numBlocks(); bi++) { + double[] avals = a.valuesAt(bi), cvals = c.valuesAt(bi); + Arrays.parallelSetAll(cvals, i -> op.fn.execute(avals[i])); + } ret.recomputeNonZeros(); } else {
