Repository: systemml Updated Branches: refs/heads/master 79a5e80f6 -> 5a155f3d2
[SYSTEMML-2329] New sampling-based mm sparsity estimator This patch introduces an additional baseline sparsity estimator based on random sampling of columns from A and aligned rows from B for estimating the output sparsity of AB. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5a155f3d Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5a155f3d Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5a155f3d Branch: refs/heads/master Commit: 5a155f3d2402b1d77ca90a1d024cb04d6a2ca80d Parents: 79a5e80 Author: Matthias Boehm <mboe...@gmail.com> Authored: Thu May 17 20:39:24 2018 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Thu May 17 20:39:24 2018 -0700 ---------------------------------------------------------------------- .../sysml/hops/estim/EstimatorSample.java | 111 +++++++++++++++++++ .../estim/CompressedSizeEstimatorSample.java | 10 +- .../sysml/runtime/util/UtilFunctions.java | 14 ++- .../functions/estim/OuterProductTest.java | 21 ++++ .../functions/estim/SquaredProductTest.java | 21 ++++ 5 files changed, 166 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java new file mode 100644 index 0000000..6ed918a --- /dev/null +++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysml.hops.estim; + +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.MatrixCharacteristics; +import org.apache.sysml.runtime.matrix.data.DenseBlock; +import org.apache.sysml.runtime.matrix.data.LibMatrixAgg; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.data.SparseBlock; +import org.apache.sysml.runtime.util.UtilFunctions; + +/** + * This estimator implements an approach based on row/column sampling + * Yongyang Yu, MingJie Tang, Walid G. Aref, Qutaibah M. Malluhi, Mostafa M. Abbas, Mourad Ouzzani: + * In-Memory Distributed Matrix Computation Processing and Optimization. ICDE 2017: 1047-1058 + * + * The basic idea is to draw random samples of aligned columns SA and rows SB, + * and compute the output nnz as max(nnz(SA_i)*nnz(SB_i)). However, this estimator is + * biased toward underestimation as the maximum is unlikely sampled and collisions are + * not accounted for. + */ +public class EstimatorSample extends SparsityEstimator +{ + private static final double SAMPLE_FRACTION = 0.1; //10% + + private final double _frac; + + public EstimatorSample() { + this(SAMPLE_FRACTION); + } + + public EstimatorSample(double sampleFrac) { + if( sampleFrac < 0 || sampleFrac > 1.0 ) + throw new DMLRuntimeException("Invalid sample fraction: "+sampleFrac); + _frac = sampleFrac; + } + + @Override + public double estim(MMNode root) { + LOG.warn("Recursive estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg."); + return new EstimatorBasicAvg().estim(root); + } + + @Override + public double estim(MatrixBlock m1, MatrixBlock m2) { + //get sampled indexes + int k = m1.getNumColumns(); + int[] ix = UtilFunctions.getSortedSampleIndexes( + k, (int)Math.max(k*_frac, 1)); + //compute output sparsity + int[] cnnz = computeColumnNnz(m1, ix); + long nnzOut = 0; + for(int i=0; i<ix.length; i++) + nnzOut = Math.max(nnzOut, cnnz[i] * m2.recomputeNonZeros(ix[i], ix[i])); + return OptimizerUtils.getSparsity( + m1.getNumRows(), m2.getNumColumns(), nnzOut); + } + + @Override + public double estim(MatrixCharacteristics mc1, MatrixCharacteristics mc2) { + LOG.warn("Meta-data-only estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg."); + return new EstimatorBasicAvg().estim(mc1, mc2); + } + + private int[] computeColumnNnz(MatrixBlock in, int[] ix) { + int[] nnz = new int[in.getNumColumns()]; + //count column nnz brute force or selective + if( in.isInSparseFormat() ) { + SparseBlock sblock = in.getSparseBlock(); + for( int i=0; i<in.getNumRows(); i++ ) { + if( sblock.isEmpty(i) ) continue; + LibMatrixAgg.countAgg(sblock.values(i), nnz, + sblock.indexes(i), sblock.pos(i), sblock.size(i)); + } + } + else { + DenseBlock dblock = in.getDenseBlock(); + for( int i=0; i<in.getNumRows(); i++ ) { + double[] avals = dblock.values(i); + int aix = dblock.pos(i); + for( int j=0; j<in.getNumColumns(); j++ ) + nnz[j] += (avals[aix+j] != 0) ? 1 : 0; + } + } + + //copy nnz into reduced vector + int[] ret = new int[ix.length]; + for(int i=0; i<ix.length; i++) + ret[i] = nnz[ix[i]]; + return ret; + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java index d4c829c..73c93bf 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java +++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java @@ -27,13 +27,13 @@ import org.apache.commons.logging.LogFactory; import org.apache.commons.math3.analysis.UnivariateFunction; import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils; import org.apache.commons.math3.distribution.ChiSquaredDistribution; -import org.apache.commons.math3.random.RandomDataGenerator; import org.apache.sysml.runtime.compress.BitmapEncoder; import org.apache.sysml.runtime.compress.ReaderColumnSelection; import org.apache.sysml.runtime.compress.CompressedMatrixBlock; import org.apache.sysml.runtime.compress.UncompressedBitmap; import org.apache.sysml.runtime.compress.utils.DblArray; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.util.UtilFunctions; public class CompressedSizeEstimatorSample extends CompressedSizeEstimator { @@ -303,12 +303,8 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator * @return sorted array of integers */ private static int[] getSortedUniformSample(int range, int smplSize) { - if (smplSize == 0) - return new int[] {}; - RandomDataGenerator rng = new RandomDataGenerator(); - int[] sample = rng.nextPermutation(range, smplSize); - Arrays.sort(sample); - return sample; + if (smplSize == 0) return new int[] {}; + return UtilFunctions.getSortedSampleIndexes(range, smplSize); } http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java index e32cb22..58dde27 100644 --- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java @@ -29,6 +29,7 @@ import java.util.stream.Stream; import java.util.stream.StreamSupport; import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.math3.random.RandomDataGenerator; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer; import org.apache.sysml.runtime.matrix.data.FrameBlock; @@ -515,17 +516,22 @@ public class UtilFunctions return 0; //equal } - public static boolean isIntegerNumber( String str ) - { + public static boolean isIntegerNumber( String str ) { byte[] c = str.getBytes(); for( int i=0; i<c.length; i++ ) if( c[i] < 48 || c[i] > 57 ) return false; return true; } + + public static int[] getSortedSampleIndexes(int range, int sampleSize) { + RandomDataGenerator rng = new RandomDataGenerator(); + int[] sample = rng.nextPermutation(range, sampleSize); + Arrays.sort(sample); + return sample; + } - public static byte max( byte[] array ) - { + public static byte max( byte[] array ) { byte ret = Byte.MIN_VALUE; for( int i=0; i<array.length; i++ ) ret = (array[i]>ret)?array[i]:ret; http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java index f45ca70..70facf7 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java @@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst; import org.apache.sysml.hops.estim.EstimatorBitsetMM; import org.apache.sysml.hops.estim.EstimatorDensityMap; import org.apache.sysml.hops.estim.EstimatorMatrixHistogram; +import org.apache.sysml.hops.estim.EstimatorSample; import org.apache.sysml.hops.estim.SparsityEstimator; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.data.MatrixBlock; @@ -118,6 +119,26 @@ public class OuterProductTest extends AutomatedTestBase runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2); } + @Test + public void testSamplingDefCase1() { + runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1); + } + + @Test + public void testSamplingDefCase2() { + runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2); + } + + @Test + public void testSampling20Case1() { + runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1); + } + + @Test + public void testSampling20Case2() { + runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2); + } + private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) { MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3); MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3); http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java index 204842c..917764b 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java @@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst; import org.apache.sysml.hops.estim.EstimatorBitsetMM; import org.apache.sysml.hops.estim.EstimatorDensityMap; import org.apache.sysml.hops.estim.EstimatorMatrixHistogram; +import org.apache.sysml.hops.estim.EstimatorSample; import org.apache.sysml.hops.estim.SparsityEstimator; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.data.MatrixBlock; @@ -123,6 +124,26 @@ public class SquaredProductTest extends AutomatedTestBase runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2); } + @Test + public void testSamplingDefCase1() { + runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1); + } + + @Test + public void testSamplingDefCase2() { + runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2); + } + + @Test + public void testSampling20Case1() { + runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1); + } + + @Test + public void testSampling20Case2() { + runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2); + } + private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) { MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3); MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3);