Repository: systemml
Updated Branches:
  refs/heads/master 79a5e80f6 -> 5a155f3d2


[SYSTEMML-2329] New sampling-based mm sparsity estimator 

This patch introduces an additional baseline sparsity estimator based on
random sampling of columns from A and aligned rows from B for estimating
the output sparsity of AB.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5a155f3d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5a155f3d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5a155f3d

Branch: refs/heads/master
Commit: 5a155f3d2402b1d77ca90a1d024cb04d6a2ca80d
Parents: 79a5e80
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Thu May 17 20:39:24 2018 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Thu May 17 20:39:24 2018 -0700

----------------------------------------------------------------------
 .../sysml/hops/estim/EstimatorSample.java       | 111 +++++++++++++++++++
 .../estim/CompressedSizeEstimatorSample.java    |  10 +-
 .../sysml/runtime/util/UtilFunctions.java       |  14 ++-
 .../functions/estim/OuterProductTest.java       |  21 ++++
 .../functions/estim/SquaredProductTest.java     |  21 ++++
 5 files changed, 166 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
new file mode 100644
index 0000000..6ed918a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.estim;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.DenseBlock;
+import org.apache.sysml.runtime.matrix.data.LibMatrixAgg;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+/**
+ * This estimator implements an approach based on row/column sampling
+ * Yongyang Yu, MingJie Tang, Walid G. Aref, Qutaibah M. Malluhi, Mostafa M. 
Abbas, Mourad Ouzzani:
+ * In-Memory Distributed Matrix Computation Processing and Optimization. ICDE 
2017: 1047-1058
+ * 
+ * The basic idea is to draw random samples of aligned columns SA and rows SB,
+ * and compute the output nnz as max(nnz(SA_i)*nnz(SB_i)). However, this 
estimator is
+ * biased toward underestimation as the maximum is unlikely sampled and 
collisions are
+ * not accounted for.
+ */
+public class EstimatorSample extends SparsityEstimator
+{
+       private static final double SAMPLE_FRACTION = 0.1; //10%
+       
+       private final double _frac;
+       
+       public EstimatorSample() {
+               this(SAMPLE_FRACTION);
+       }
+       
+       public EstimatorSample(double sampleFrac) {
+               if( sampleFrac < 0 || sampleFrac > 1.0 )
+                       throw new DMLRuntimeException("Invalid sample fraction: 
"+sampleFrac);
+               _frac = sampleFrac;
+       }
+       
+       @Override
+       public double estim(MMNode root) {
+               LOG.warn("Recursive estimates not supported by EstimatorSample, 
falling back to EstimatorBasicAvg.");
+               return new EstimatorBasicAvg().estim(root);
+       }
+
+       @Override
+       public double estim(MatrixBlock m1, MatrixBlock m2) {
+               //get sampled indexes
+               int k = m1.getNumColumns();
+               int[] ix = UtilFunctions.getSortedSampleIndexes(
+                       k, (int)Math.max(k*_frac, 1));
+               //compute output sparsity 
+               int[] cnnz = computeColumnNnz(m1, ix);
+               long nnzOut = 0;
+               for(int i=0; i<ix.length; i++)
+                       nnzOut = Math.max(nnzOut, cnnz[i] * 
m2.recomputeNonZeros(ix[i], ix[i]));
+               return OptimizerUtils.getSparsity( 
+                       m1.getNumRows(), m2.getNumColumns(), nnzOut);
+       }
+
+       @Override
+       public double estim(MatrixCharacteristics mc1, MatrixCharacteristics 
mc2) {
+               LOG.warn("Meta-data-only estimates not supported by 
EstimatorSample, falling back to EstimatorBasicAvg.");
+               return new EstimatorBasicAvg().estim(mc1, mc2);
+       }
+       
+       private int[] computeColumnNnz(MatrixBlock in, int[] ix) {
+               int[] nnz = new int[in.getNumColumns()];
+               //count column nnz brute force or selective
+               if( in.isInSparseFormat() ) {
+                       SparseBlock sblock = in.getSparseBlock();
+                       for( int i=0; i<in.getNumRows(); i++ ) {
+                               if( sblock.isEmpty(i) ) continue;
+                               LibMatrixAgg.countAgg(sblock.values(i), nnz,
+                                       sblock.indexes(i), sblock.pos(i), 
sblock.size(i));
+                       }
+               }
+               else {
+                       DenseBlock dblock = in.getDenseBlock();
+                       for( int i=0; i<in.getNumRows(); i++ ) {
+                               double[] avals = dblock.values(i);
+                               int aix = dblock.pos(i);
+                               for( int j=0; j<in.getNumColumns(); j++ )
+                                       nnz[j] += (avals[aix+j] != 0) ? 1 : 0;
+                       }
+               }
+               
+               //copy nnz into reduced vector
+               int[] ret = new int[ix.length];
+               for(int i=0; i<ix.length; i++)
+                       ret[i] = nnz[ix[i]];
+               return ret;
+       }
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
 
b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
index d4c829c..73c93bf 100644
--- 
a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ 
b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -27,13 +27,13 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.commons.math3.analysis.UnivariateFunction;
 import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
 import org.apache.commons.math3.distribution.ChiSquaredDistribution;
-import org.apache.commons.math3.random.RandomDataGenerator;
 import org.apache.sysml.runtime.compress.BitmapEncoder;
 import org.apache.sysml.runtime.compress.ReaderColumnSelection;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.compress.UncompressedBitmap;
 import org.apache.sysml.runtime.compress.utils.DblArray;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class CompressedSizeEstimatorSample extends CompressedSizeEstimator 
 {
@@ -303,12 +303,8 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator
         * @return sorted array of integers
         */
        private static int[] getSortedUniformSample(int range, int smplSize) {
-               if (smplSize == 0)
-                       return new int[] {};
-               RandomDataGenerator rng = new RandomDataGenerator();
-               int[] sample = rng.nextPermutation(range, smplSize);
-               Arrays.sort(sample);
-               return sample;
+               if (smplSize == 0) return new int[] {};
+               return UtilFunctions.getSortedSampleIndexes(range, smplSize);
        }
        
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java 
b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index e32cb22..58dde27 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -29,6 +29,7 @@ import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
 import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.math3.random.RandomDataGenerator;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
@@ -515,17 +516,22 @@ public class UtilFunctions
                return 0; //equal 
        }
        
-       public static boolean isIntegerNumber( String str )
-       {
+       public static boolean isIntegerNumber( String str ) {
                byte[] c = str.getBytes();
                for( int i=0; i<c.length; i++ )
                        if( c[i] < 48 || c[i] > 57 )
                                return false;
                return true;
        }
+       
+       public static int[] getSortedSampleIndexes(int range, int sampleSize) {
+               RandomDataGenerator rng = new RandomDataGenerator();
+               int[] sample = rng.nextPermutation(range, sampleSize);
+               Arrays.sort(sample);
+               return sample;
+       }
 
-       public static byte max( byte[] array )
-       {
+       public static byte max( byte[] array ) {
                byte ret = Byte.MIN_VALUE;
                for( int i=0; i<array.length; i++ )
                        ret = (array[i]>ret)?array[i]:ret;

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
index f45ca70..70facf7 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
 import org.apache.sysml.hops.estim.EstimatorBitsetMM;
 import org.apache.sysml.hops.estim.EstimatorDensityMap;
 import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
 import org.apache.sysml.hops.estim.SparsityEstimator;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -118,6 +119,26 @@ public class OuterProductTest extends AutomatedTestBase
                runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, 
k, n, case2);
        }
        
+       @Test
+       public void testSamplingDefCase1() {
+               runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+       }
+       
+       @Test
+       public void testSamplingDefCase2() {
+               runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+       }
+       
+       @Test
+       public void testSampling20Case1() {
+               runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, 
case1);
+       }
+       
+       @Test
+       public void testSampling20Case2() {
+               runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, 
case2);
+       }
+       
        private void runSparsityEstimateTest(SparsityEstimator estim, int m, 
int k, int n, double[] sp) {
                MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, 
"uniform", 3);
                MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, 
"uniform", 3);

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
index 204842c..917764b 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
 import org.apache.sysml.hops.estim.EstimatorBitsetMM;
 import org.apache.sysml.hops.estim.EstimatorDensityMap;
 import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
 import org.apache.sysml.hops.estim.SparsityEstimator;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -123,6 +124,26 @@ public class SquaredProductTest extends AutomatedTestBase
                runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, 
k, n, case2);
        }
        
+       @Test
+       public void testSamplingDefCase1() {
+               runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+       }
+       
+       @Test
+       public void testSamplingDefCase2() {
+               runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+       }
+       
+       @Test
+       public void testSampling20Case1() {
+               runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, 
case1);
+       }
+       
+       @Test
+       public void testSampling20Case2() {
+               runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, 
case2);
+       }
+       
        private void runSparsityEstimateTest(SparsityEstimator estim, int m, 
int k, int n, double[] sp) {
                MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, 
"uniform", 3);
                MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, 
"uniform", 3);

Reply via email to