Repository: systemml
Updated Branches:
  refs/heads/master 78e9d836e -> 11f0291d7


[SYSTEMML-2486] Performance bitset sparsity estimator (multi-threading)

This patch extends the bitset sparsity estimator (based on boolean
matrix multiply) by optional multi-threaded construction and matrix
multiplication. Similar to floating point matrix multiplication C=AB, we
parallelize over row partitions in A and C with 4*k partitions to
mitigate load imbalance.

In a single node setup of an Intel Xeon Gold 6138 (40 pcores, 80
vcores), this patch improved the end-to-end estimation time (2x build,
1x matrixmult) as follows:

a) 10K x 10K, sp=0.01 (2x): 302ms -> 12.4ms.
b) 50K x 50K, sp=0.01 (2x): 30,531ms -> 1,372ms (baseline mm: 6,233ms).

With multi-threaded matrix mult, this kernel now became memory-bandwidth
bound with 79% LLC load misses. So far, this implementation is based on
Java's BitSet per row. For the sake of a cache-conscious implementation
(that requires range ORs), we should create our own tailor-made bit
matrix representation and operations.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/11f0291d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/11f0291d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/11f0291d

Branch: refs/heads/master
Commit: 11f0291d7537c7639241789b084385196024e45e
Parents: 78e9d83
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Sun Aug 5 21:38:24 2018 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Sun Aug 5 21:38:47 2018 -0700

----------------------------------------------------------------------
 .../sysml/hops/estim/EstimatorBitsetMM.java     | 75 ++++++++++++++------
 .../sysml/hops/estim/SparsityEstimator.java     |  5 ++
 2 files changed, 60 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java 
b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
index 011975e..9bde8c8 100644
--- a/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorBitsetMM.java
@@ -20,9 +20,11 @@
 package org.apache.sysml.hops.estim;
 
 import java.util.BitSet;
+import java.util.stream.IntStream;
 
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysml.hops.OptimizerUtils;
+import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.matrix.data.DenseBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
@@ -87,8 +89,6 @@ public class EstimatorBitsetMM extends SparsityEstimator {
                        _rlen = rlen;
                        _clen = clen;
                        _data = new BitSet[_rlen];
-                       for (int i = 0; i < _rlen; i++)
-                               _data[i] = new BitSet(_clen);
                        _nonZeros = 0;
                }
 
@@ -112,12 +112,45 @@ public class EstimatorBitsetMM extends SparsityEstimator {
                private void init(MatrixBlock in) {
                        if (in.isEmptyBlock(false))
                                return;
+                       if( MULTI_THREADED_BUILD && in.getNonZeros() > 
MIN_PAR_THRESHOLD ) {
+                               int k = 8 * 
InfrastructureAnalyzer.getLocalParallelism();
+                               int blklen = (int)Math.ceil((double)_rlen/k);
+                               IntStream.range(0, k).parallel().forEach(i ->
+                                       buildIntern(this, in, i*blklen, 
Math.min((i+1)*blklen, _rlen)));
+                       }
+                       else {
+                               //single-threaded bitset construction
+                               buildIntern(this, in, 0, in.getNumRows());
+                       }
+                       _nonZeros = in.getNonZeros();
+               }
+
+               public BitsetMatrix matMult(BitsetMatrix m2) {
+                       BitsetMatrix out = new BitsetMatrix(_rlen, m2._clen);
+                       if( this.getNonZeros() == 0 || m2.getNonZeros() == 0 )
+                               return out;
+                       long size = (long)_rlen*_clen+(long)m2._rlen*m2._clen;
+                       if( MULTI_THREADED_ESTIM && size > MIN_PAR_THRESHOLD ) {
+                               int k = 8 * 
InfrastructureAnalyzer.getLocalParallelism();
+                               int blklen = (int)Math.ceil((double)_rlen/k);
+                               out._nonZeros = IntStream.range(0, 
k).parallel().mapToLong(i ->
+                                       matMultIntern(this, m2, out, i*blklen, 
Math.min((i+1)*blklen, _rlen))).sum();
+                       }
+                       else {
+                               //single-threaded boolean matrix mult
+                               out._nonZeros = matMultIntern(this, m2, out, 0, 
_rlen);
+                       }
+                       return out;
+               }
+               
+               private static void buildIntern(BitsetMatrix bitset, 
MatrixBlock in, int rl, int ru) {
+                       final int clen = in.getNumColumns();
                        if (in.isInSparseFormat()) {
                                SparseBlock sblock = in.getSparseBlock();
-                               for (int i = 0; i < in.getNumRows(); i++) {
+                               for (int i = rl; i < ru; i++) {
                                        if (sblock.isEmpty(i))
                                                continue;
-                                       BitSet lbs = _data[i];
+                                       BitSet lbs = bitset._data[i] = new 
BitSet(clen);
                                        int alen = sblock.size(i);
                                        int apos = sblock.pos(i);
                                        int[] aix = sblock.indexes(i);
@@ -127,7 +160,7 @@ public class EstimatorBitsetMM extends SparsityEstimator {
                        } else {
                                DenseBlock dblock = in.getDenseBlock();
                                for (int i = 0; i < in.getNumRows(); i++) {
-                                       BitSet lbs = _data[i];
+                                       BitSet lbs = bitset._data[i] = new 
BitSet(clen);
                                        double[] avals = dblock.values(i);
                                        int aix = dblock.pos(i);
                                        for (int j = 0; j < in.getNumColumns(); 
j++)
@@ -135,25 +168,27 @@ public class EstimatorBitsetMM extends SparsityEstimator {
                                                        lbs.set(j);
                                }
                        }
-                       _nonZeros = in.getNonZeros();
                }
-
-               public BitsetMatrix matMult(BitsetMatrix m2) {
-                       final int m = this._rlen;
-                       final int cd = this._clen;
-                       final int n = m2._clen;
+               
+               private static long matMultIntern(BitsetMatrix bsa, 
BitsetMatrix bsb, BitsetMatrix bsc, int rl, int ru) {
+                       final int cd = bsa._clen;
+                       final int n = bsb._clen;
                        // matrix multiply with IKJ schedule and pure OR ops in 
inner loop
-                       BitsetMatrix out = new BitsetMatrix(m, n);
-                       for (int i = 0; i < m; i++) {
-                               BitSet a = this._data[i], c = out._data[i];
-                               for (int k = 0; k < cd; k++) {
-                                       if (a.get(k))
-                                               c.or(m2._data[k]);
+                       long lnnz = 0;
+                       for (int i = rl; i < ru; i++) {
+                               BitSet a = bsa._data[i];
+                               if( a != null ) {
+                                       BitSet c = bsc._data[i] = new BitSet(n);
+                                       for (int k = 0; k < cd; k++) {
+                                               BitSet b = bsb._data[k];
+                                               if (a.get(k) && b != null)
+                                                       c.or(b);
+                                       }
+                                       // maintain nnz
+                                       lnnz += c.cardinality();
                                }
-                               // maintain nnz
-                               out._nonZeros += c.cardinality();
                        }
-                       return out;
+                       return lnnz;
                }
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/11f0291d/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java 
b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
index 61a7207..411d9cd 100644
--- a/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
+++ b/src/main/java/org/apache/sysml/hops/estim/SparsityEstimator.java
@@ -27,6 +27,11 @@ public abstract class SparsityEstimator
 {
        protected static final Log LOG = 
LogFactory.getLog(SparsityEstimator.class.getName());
        
+       //internal configuration
+       public static boolean MULTI_THREADED_BUILD = false;
+       public static boolean MULTI_THREADED_ESTIM = false;
+       public static final int MIN_PAR_THRESHOLD = 10 * 1024;
+       
        public static enum OpCode {
                MM, 
                MULT, PLUS, EQZERO, NEQZERO,

Reply via email to