This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit e7abe8765b06f7a9a59c349d6a2f15f67f15685f
Author: baunsgaard <[email protected]>
AuthorDate: Mon May 10 17:47:25 2021 +0200

    [SYSTEMDS-2997] CLA MatrixBlock Dictionary
    
    - Add Basic MatrixBlock Dictionary
    - CocodeMatrixCost function for estimating the matrix multiplication cost.
---
 .../runtime/compress/CompressionSettings.java      |  23 ++-
 .../compress/CompressionSettingsBuilder.java       |  19 +-
 .../runtime/compress/cocode/AColumnCoCoder.java    |  10 +-
 .../sysds/runtime/compress/cocode/CoCodeCost.java  |   8 +-
 .../{CoCodeCost.java => CoCodeCostMatrixMult.java} |  97 ++++-----
 .../runtime/compress/cocode/PlanningCoCoder.java   |   8 +-
 .../runtime/compress/colgroup/ColGroupSDC.java     |  28 ++-
 .../compress/colgroup/ColGroupSDCSingle.java       |  28 ++-
 .../runtime/compress/colgroup/ColGroupValue.java   |  67 ++-----
 .../compress/colgroup/dictionary/ADictionary.java  |   4 +-
 .../compress/colgroup/dictionary/Dictionary.java   |   3 -
 .../compress/colgroup/dictionary/QDictionary.java  |  50 +++--
 .../colgroup/dictionary/SparseDictionary.java      | 218 +++++++++++++++++++++
 .../compress/estim/CompressedSizeEstimator.java    |  19 +-
 .../estim/CompressedSizeEstimatorSample.java       |   8 +-
 .../compress/estim/CompressedSizeInfoColGroup.java |   8 +
 16 files changed, 440 insertions(+), 158 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index f1ff14e..7be74ef 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -21,6 +21,8 @@ package org.apache.sysds.runtime.compress;
 
 import java.util.EnumSet;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import 
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 
@@ -29,6 +31,7 @@ import 
org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
  * CompressionSettingsBuilder for default non static parameters.
  */
 public class CompressionSettings {
+       private static final Log LOG = 
LogFactory.getLog(CompressionSettings.class.getName());
 
        /**
         * Size of the blocks used in a blocked bitmap representation. Note it 
is exactly Character.MAX_VALUE. This is not
@@ -79,8 +82,14 @@ public class CompressionSettings {
        /** The selected method for column partitioning used in CoCoding 
compressed columns */
        public final PartitionerType columnPartitioner;
 
-       /** The maximum number of columns CoCoded if the Static CoCoding 
strategy is selected */
-       public final int maxStaticColGroupCoCode;
+       /** The maximum number of columns CoCoded allowed */
+       public final int maxColGroupCoCode;
+
+       /**
+        * A Cocode parameter that differ in behavior based on compression 
method, in general it is a value that reflects
+        * aggressively likely coCoding is used.
+        */
+       public final double coCodePercentage;
 
        /**
         * Valid Compressions List, containing the ColGroup CompressionTypes 
that are allowed to be used for the compression
@@ -91,7 +100,7 @@ public class CompressionSettings {
        protected CompressionSettings(double samplingRatio, boolean 
allowSharedDictionary, String transposeInput,
                boolean skipList, int seed, boolean investigateEstimate, 
boolean lossy,
                EnumSet<CompressionType> validCompressions, boolean 
sortValuesByLength, PartitionerType columnPartitioner,
-               int maxStaticColGroupCoCode) {
+               int maxColGroupCoCode, double coCodePercentage) {
                this.samplingRatio = samplingRatio;
                this.allowSharedDictionary = allowSharedDictionary;
                this.transposeInput = transposeInput;
@@ -102,7 +111,9 @@ public class CompressionSettings {
                this.lossy = lossy;
                this.sortValuesByLength = sortValuesByLength;
                this.columnPartitioner = columnPartitioner;
-               this.maxStaticColGroupCoCode = maxStaticColGroupCoCode;
+               this.maxColGroupCoCode = maxColGroupCoCode;
+               this.coCodePercentage = coCodePercentage;
+               LOG.debug(this);
        }
 
        @Override
@@ -113,6 +124,10 @@ public class CompressionSettings {
                sb.append("\n DDC1 share dict: " + allowSharedDictionary);
                sb.append("\n Partitioner: " + columnPartitioner);
                sb.append("\n Lossy: " + lossy);
+               sb.append("\n sortValuesByLength: " + sortValuesByLength);
+               sb.append("\n column Partitioner: " + columnPartitioner);
+               sb.append("\n max Static ColGroup CoCode " + maxColGroupCoCode);
+               sb.append("\n max cocodePercentage " + coCodePercentage);
                // If needed for debugging add more fields to the printing.
                return sb.toString();
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index a53bcfa..72aeeeb 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -41,7 +41,8 @@ public class CompressionSettingsBuilder {
        private EnumSet<CompressionType> validCompressions;
        private boolean sortValuesByLength = true;
        private PartitionerType columnPartitioner;
-       private int maxStaticColGroupCoCode = 10;
+       private int maxStaticColGroupCoCode = 10000;
+       private double coCodePercentage = 0.01;
 
        public CompressionSettingsBuilder() {
 
@@ -236,6 +237,20 @@ public class CompressionSettingsBuilder {
        }
 
        /**
+        * Set the coCode percentage, the effect is different based on the 
coCoding strategy, but the general effect is that
+        * higher values results in more coCoding while lower values result in 
less.
+        * 
+        * Note that with high coCoding the compression ratio would possibly be 
lower.
+        * 
+        * @param coCodePercentage The percentage to set.
+        * @return The CompressionSettingsBuilder
+        */
+       public CompressionSettingsBuilder setCoCodePercentage(double 
coCodePercentage) {
+               this.coCodePercentage = coCodePercentage;
+               return this;
+       }
+
+       /**
         * Create the CompressionSettings object to use in the compression.
         * 
         * @return The CompressionSettings
@@ -243,6 +258,6 @@ public class CompressionSettingsBuilder {
        public CompressionSettings create() {
                return new CompressionSettings(samplingRatio, 
allowSharedDictionary, transposeInput, skipList, seed,
                        investigateEstimate, lossy, validCompressions, 
sortValuesByLength, columnPartitioner,
-                       maxStaticColGroupCoCode);
+                       maxStaticColGroupCoCode, coCodePercentage);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
index 4dc1f8e..1ede652 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
@@ -30,14 +30,14 @@ public abstract class AColumnCoCoder {
 
        protected static final Log LOG = 
LogFactory.getLog(AColumnCoCoder.class.getName());
 
-       protected CompressedSizeEstimator _est;
-       protected CompressionSettings _cs;
-       protected int _numRows;
+       final protected CompressedSizeEstimator _est;
+       final protected CompressionSettings _cs;
+       // final protected int _numRows;
 
        protected AColumnCoCoder(CompressedSizeEstimator sizeEstimator, 
CompressionSettings cs, int numRows) {
                _est = sizeEstimator;
                _cs = cs;
-               _numRows = numRows;
+               // _numRows = numRows;
        }
 
        /**
@@ -64,7 +64,7 @@ public abstract class AColumnCoCoder {
                CompressedSizeInfoColGroup rhs) {
                int[] joined = Util.join(lhs.getColumns(), rhs.getColumns());
                int numVals = lhs.getNumVals() + rhs.getNumVals();
-               return new CompressedSizeInfoColGroup(joined, numVals, 
_numRows);
+               return new CompressedSizeInfoColGroup(joined, numVals, 
_est.getNumRows());
        }
 
        protected CompressedSizeInfoColGroup analyze(CompressedSizeInfoColGroup 
g) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
index 1cce5b1..d2aec2c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
@@ -48,15 +48,11 @@ public class CoCodeCost extends AColumnCoCoder {
         */
        private final int largestDistinct;
 
-       private final int toSmallForAnalysis;
-
-       private final double percentMaxCardinality = 0.08;
+       private final static int toSmallForAnalysis = 64;
 
        protected CoCodeCost(CompressedSizeEstimator sizeEstimator, 
CompressionSettings cs, int numRows) {
                super(sizeEstimator, cs, numRows);
-               largestDistinct = Math.max(256, (int) (_numRows * 
percentMaxCardinality));
-               toSmallForAnalysis = largestDistinct / 4;
-               LOG.error("CocodeCost largest Distinct: "+ largestDistinct + " 
toSmallForAnalysis: " + toSmallForAnalysis);
+               largestDistinct = Math.min(4096, Math.max(256, (int) 
(sizeEstimator.getNumRows() * cs.coCodePercentage)));
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
similarity index 56%
copy from src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
copy to 
src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
index 1cce5b1..09a9990 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
@@ -20,13 +20,12 @@
 package org.apache.sysds.runtime.compress.cocode;
 
 import java.util.ArrayList;
-import java.util.Comparator;
+import java.util.Arrays;
 import java.util.List;
 import java.util.PriorityQueue;
 import java.util.Queue;
 
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
@@ -39,7 +38,7 @@ import 
org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
  * This method allows us to compress many more columns than the BinPacking
  * 
  */
-public class CoCodeCost extends AColumnCoCoder {
+public class CoCodeCostMatrixMult extends AColumnCoCoder {
 
        /**
         * This value specifies the maximum distinct count allowed int a 
coCoded group. Note that this value is the number
@@ -50,13 +49,11 @@ public class CoCodeCost extends AColumnCoCoder {
 
        private final int toSmallForAnalysis;
 
-       private final double percentMaxCardinality = 0.08;
-
-       protected CoCodeCost(CompressedSizeEstimator sizeEstimator, 
CompressionSettings cs, int numRows) {
-               super(sizeEstimator, cs, numRows);
-               largestDistinct = Math.max(256, (int) (_numRows * 
percentMaxCardinality));
-               toSmallForAnalysis = largestDistinct / 4;
-               LOG.error("CocodeCost largest Distinct: "+ largestDistinct + " 
toSmallForAnalysis: " + toSmallForAnalysis);
+       protected CoCodeCostMatrixMult(CompressedSizeEstimator e, 
CompressionSettings cs, int numRows) {
+               super(e, cs, numRows);
+               largestDistinct = Math.max(256, (int) (_est.getNumRows() * 
_est.getNumColumns() * cs.coCodePercentage * 0.2));
+               toSmallForAnalysis = Math.min(Math.max(256, largestDistinct / 
4), 1028);
+               LOG.debug("CocodeCost largest Distinct: " + largestDistinct + " 
toSmallForAnalysis: " + toSmallForAnalysis);
        }
 
        @Override
@@ -67,54 +64,66 @@ public class CoCodeCost extends AColumnCoCoder {
 
        private List<CompressedSizeInfoColGroup> 
join(List<CompressedSizeInfoColGroup> currentGroups) {
 
-               Comparator<CompressedSizeInfoColGroup> comp = 
Comparator.comparing(CompressedSizeInfoColGroup::getNumVals);
-               Queue<CompressedSizeInfoColGroup> que = new 
PriorityQueue<>(currentGroups.size(), comp);
-               List<CompressedSizeInfoColGroup> ret = new ArrayList<>();
+               Queue<CostOfJoin> que = new 
PriorityQueue<>(currentGroups.size());
 
-               for(CompressedSizeInfoColGroup g : currentGroups) {
-                       if(g.getBestCompressionType() == CompressionType.CONST)
-                               ret.add(g);
-                       else
-                               que.add(g);
-               }
+               List<CompressedSizeInfoColGroup> ret = new ArrayList<>();
+               for(CompressedSizeInfoColGroup g : currentGroups)
+                       que.add(new CostOfJoin(g));
 
-               boolean finished = false;
-               while(!finished) {
+               while(true) {
                        if(que.peek() != null) {
-                               CompressedSizeInfoColGroup l = que.poll();
+                               final CostOfJoin l = que.poll();
                                if(que.peek() != null) {
-                                       CompressedSizeInfoColGroup r = 
que.poll();
-                                       int worstCaseJoinedSize = 
l.getNumVals() * r.getNumVals();
-                                       if(worstCaseJoinedSize < 
toSmallForAnalysis)
-                                               que.add(joinWithoutAnalysis(l, 
r));
-                                       else if(worstCaseJoinedSize < 
largestDistinct){
-
-                                               CompressedSizeInfoColGroup g = 
joinWithAnalysis(l, r);
-                                               if(g.getNumVals() < 
largestDistinct)
-                                                       
que.add(joinWithAnalysis(l, r));
-                                               else{
-                                                       finished = true;
-                                                       que.add(l);
-                                                       que.add(r);
-                                               }
-                                       }
+                                       final CostOfJoin r = que.poll();
+                                       final double costIndividual = (l.cost + 
r.cost);
+                                       final CostOfJoin g = new 
CostOfJoin(joinWithAnalysis(l.elm, r.elm));
+                                       if(g.cost < costIndividual)
+                                               que.add(g);
                                        else {
-                                               finished = true;
-                                               que.add(l);
+                                               ret.add(l.elm);
                                                que.add(r);
                                        }
                                }
                                else {
-                                       que.add(l);
-                                       finished = true;
+                                       ret.add(l.elm);
+                                       break;
                                }
                        }
                        else
-                               finished = true;
+                               break;
                }
-               for(CompressedSizeInfoColGroup g : que)
-                       ret.add(g);
+               for(CostOfJoin g : que)
+                       ret.add(g.elm);
 
                return ret;
        }
+
+       private class CostOfJoin implements Comparable<CostOfJoin> {
+               protected final CompressedSizeInfoColGroup elm;
+               protected final double cost;
+
+               protected CostOfJoin(CompressedSizeInfoColGroup elm) {
+                       this.elm = elm;
+                       final int nRows = _est.getNumRows();
+                       final double commonFraction = 
elm.getMostCommonFraction();
+                       final double rowsToProcess = commonFraction > 0.2 ? 
nRows * (1 - Math.min(commonFraction, 0.95)) : nRows;
+                       this.cost = rowsToProcess + elm.getNumVals() * 
elm.getColumns().length;
+
+               }
+
+               @Override
+               public int compareTo(CostOfJoin o) {
+                       return cost == o.cost ? 0 : cost > o.cost ? 1 : -1;
+               }
+
+               @Override
+               public String toString() {
+                       StringBuilder sb = new StringBuilder();
+                       sb.append("\n");
+                       sb.append(cost);
+                       sb.append(" - ");
+                       sb.append(Arrays.toString(elm.getColumns()));
+                       return sb.toString();
+               }
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
index 46bf988..94572c9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
@@ -41,7 +41,7 @@ public class PlanningCoCoder {
         * The Valid coCoding techniques
         */
        public enum PartitionerType {
-               BIN_PACKING, STATIC, COST;
+               BIN_PACKING, STATIC, COST, COST_MATRIX_MULT;
        }
 
        /**
@@ -66,10 +66,10 @@ public class PlanningCoCoder {
                        constantGroups = new ArrayList<>();
                        List<CompressedSizeInfoColGroup> newGroups = new 
ArrayList<>();
                        mem = new Memorizer();
-                       for(CompressedSizeInfoColGroup g : colInfos.getInfo()){
+                       for(CompressedSizeInfoColGroup g : colInfos.getInfo()) {
                                if(g.getBestCompressionType() == 
CompressionType.CONST)
                                        constantGroups.add(g);
-                               else{
+                               else {
                                        mem.put(g);
                                        newGroups.add(g);
                                }
@@ -98,6 +98,8 @@ public class PlanningCoCoder {
                                return new CoCodeStatic(est, cs, numRows);
                        case COST:
                                return new CoCodeCost(est, cs, numRows);
+                       case COST_MATRIX_MULT:
+                               return new CoCodeCostMatrixMult(est, cs, 
numRows);
                        default:
                                throw new RuntimeException("Unsupported column 
group partitioner: " + type.toString());
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index 1acbfdc..268f294 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -643,7 +643,7 @@ public class ColGroupSDC extends ColGroupValue {
                                else
                                        itThis.next();
                        }
-       
+
                        while(itThat.hasNext()) {
                                final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
                                that._dict.addToEntry(ret, fr, 
offsetToDefaultThis, nCol);
@@ -717,7 +717,31 @@ public class ColGroupSDC extends ColGroupValue {
 
        @Override
        public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified) {
-               throw new NotImplementedException();
+               final AIterator itThat = that._indexes.getIterator();
+               final AIterator itThis = _indexes.getIterator();
+               final int nCol = that._colIndexes.length;
+               final int defThis = this.getNumValues() * nCol - nCol;
+
+               if(preModified) {
+                       while(itThat.hasNext() && itThis.hasNext()) {
+                               if(itThat.value() == itThis.value()) {
+                                       itThat.next();
+                                       final int to = 
getIndex(itThis.getDataIndexAndIncrement());
+                                       that._dict.addToEntry(ret, 0, to, nCol);
+                               }
+                               else if(itThat.value() < itThis.value()) {
+                                       itThat.next();
+                                       that._dict.addToEntry(ret, 0, defThis, 
nCol);
+                               }
+                               else
+                                       itThis.next();
+                       }
+               }
+               else {
+                       throw new NotImplementedException();
+               }
+
+               return ret;
        }
 
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index 85aab80..14afdf4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -565,7 +565,29 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
        @Override
        public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
-               throw new NotImplementedException();
+               final AIterator itThat = that._indexes.getIterator();
+               final AIterator itThis = _indexes.getIterator();
+               final int nCol = that._colIndexes.length;
+               // final int defThat = that.getNumValues() * nCol - nCol;
+
+               if(preModified) {
+                       while(itThat.hasNext() && itThis.hasNext()) {
+                               if(itThat.value() == itThis.value()) {
+                                       final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
+                                       that._dict.addToEntry(ret, fr, 1, nCol);
+                               }
+                               else if(itThat.value() < itThis.value())
+                                       itThat.next();
+                               else{
+                                       itThis.next();
+                                       // that._dict.addToEntry(ret, defThat, 
0, nCol);
+                               }
+                       }
+               }
+               else {
+                       throw new NotImplementedException();
+               }
+               return ret;
        }
 
        @Override
@@ -600,10 +622,6 @@ public class ColGroupSDCSingle extends ColGroupValue {
                                        itThis.next();
                        }
 
-                       // while(itThat.hasNext()) {
-                       //      final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
-                       //      that._dict.addToEntry(ret, fr, 1, nCol);
-                       // }
                        return ret;
                }
                else {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index 223c3a0..dc52ca1 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -828,13 +828,23 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
 
                if(sameIndexStructure(lhs)) {
                        int[] agI = getCounts();
-                       for(int a = 0, off = 0; a < nvL; a++, off += nvL + 1)
-                               leftMultDictEntry(agI[a], off, nvL, lCol, rCol, 
lhs, numCols, lhValues, rhValues, result);
+                       for(int i = 0; i < agI.length; i++) {
+                               for(int l = 0; l < lCol; l++) {
+                                       final int leftOff = lhs._colIndexes[l] 
* numCols;
+                                       final double lhV = lhValues[i * lCol + 
l] * agI[i];
+                                       if(lhV != 0)
+                                               for(int r = 0; r < rCol; r++) {
+                                                       final double rhV = 
rhValues[i * rCol + r];
+                                                       final double va = lhV * 
rhV;
+                                                       result[leftOff + 
this._colIndexes[r]] += va;
+                                               }
+                               }
+                       }
                }
                else if(lhs instanceof ColGroupConst || this instanceof 
ColGroupConst) {
-                       double[] r = this instanceof ColGroupConst ? rhValues : 
this._dict.colSum(getCounts(), rCol);
-                       double[] l = lhs instanceof ColGroupConst ? lhValues : 
lhs._dict.colSum(lhs.getCounts(), lCol);
-                       vectorVectorMultiply(l, lhs._colIndexes, r, 
this._colIndexes, result, numCols);
+                       // double[] r = this instanceof ColGroupConst ? 
rhValues : this._dict.colSum(getCounts(), rCol);
+                       // double[] l = lhs instanceof ColGroupConst ? lhValues 
: lhs._dict.colSum(lhs.getCounts(), lCol);
+                       // vectorVectorMultiply(l, lhs._colIndexes, r, 
this._colIndexes, result, numCols);
                }
                else {
                        int[] countsRight = getCounts();
@@ -848,9 +858,9 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
 
                        if(skipRight > threshold && percentageRight > 
percentageLeft && !(this instanceof ColGroupDDC)) {
                                double[] mct = 
this._dict.getMostCommonTuple(this.getCounts(), rCol);
-                               double[] lhsSum = 
lhs._dict.colSum(lhs.getCounts(), lCol);
-                               if(mct != null)
-                                       vectorVectorMultiply(lhsSum, 
lhs._colIndexes, mct, this._colIndexes, result, numCols);
+                               // double[] lhsSum = 
lhs._dict.colSum(lhs.getCounts(), lCol);
+                               // if(mct != null)
+                               //      vectorVectorMultiply(lhsSum, 
lhs._colIndexes, mct, this._colIndexes, result, numCols);
 
                                ColGroupValue thisM = (mct != null) ? 
(ColGroupValue) this
                                        
.copyAndSet(this._dict.subtractTuple(mct)) : this;
@@ -860,9 +870,9 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                        }
                        else if(skipLeft > threshold && !(lhs instanceof 
ColGroupDDC)) {
                                double[] mct = 
lhs._dict.getMostCommonTuple(lhs.getCounts(), lCol);
-                               double[] thisColSum = 
this._dict.colSum(getCounts(), rCol);
-                               if(mct != null)
-                                       vectorVectorMultiply(mct, 
lhs._colIndexes, thisColSum, this._colIndexes, result, numCols);
+                               // double[] thisColSum = 
this._dict.colSum(getCounts(), rCol);
+                               // if(mct != null)
+                               //      vectorVectorMultiply(mct, 
lhs._colIndexes, thisColSum, this._colIndexes, result, numCols);
 
                                ColGroupValue lhsM = (mct != null) ? 
(ColGroupValue) lhs.copyAndSet(lhs._dict.subtractTuple(mct)) : lhs;
                                Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhsM, true);
@@ -882,41 +892,6 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                }
        }
 
-       // private void leftMultMapPreAggregate(final int nvL, final int lCol, 
final int rCol, final ColGroupValue lhs,
-       // final int numCols, double[] lhValues, double[] rhValues, double[] c, 
MapPreAggregate agM) {
-       // final int[] map = agM.getMap();
-       // final int aggSize = agM.getSize();
-       // for(int k = 0; k < aggSize; k += 2)
-       // leftMultDictEntry(map[k + 1], map[k], nvL, lCol, rCol, lhs, numCols, 
lhValues, rhValues, c);
-       // leftMultDictEntry(agM.getMapFreeValue(), 0, nvL, lCol, rCol, lhs, 
numCols, lhValues, rhValues, c);
-       // }
-
-       // private void leftMultArrayPreAggregate(final int nvL, final int nvR, 
final int lCol, final int rCol,
-       // final ColGroupValue lhs, final int numCols, double[] lhValues, 
double[] rhValues, double[] c, int[] arr) {
-       // for(int a = 0; a < nvL * nvR; a++)
-       // leftMultDictEntry(arr[a], a, nvL, lCol, rCol, lhs, numCols, 
lhValues, rhValues, c);
-       // }
-
-       private void leftMultDictEntry(final int m, final int a, final int nvL, 
final int lCol, final int rCol,
-               final ColGroupValue lhs, final int numCols, final double[] 
lhValues, final double[] rhValues,
-               final double[] c) {
-
-               if(m > 0) {
-                       final int lhsRowOffset = (a % nvL) * lCol;
-                       final int rhsRowOffset = (a / nvL) * rCol;
-
-                       for(int j = 0; j < lCol; j++) {
-                               final int resultOff = lhs._colIndexes[j] * 
numCols;
-                               final double lh = lhValues[lhsRowOffset + j] * 
m;
-                               if(lh != 0)
-                                       for(int i = 0; i < rCol; i++) {
-                                               double rh = 
rhValues[rhsRowOffset + i];
-                                               c[resultOff + _colIndexes[i]] 
+= lh * rh;
-                                       }
-                       }
-               }
-       }
-
        @Override
        public void tsmm(double[] result, int numColumns) {
                int[] counts = getCounts();
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 1aeda85..352e96b 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -22,6 +22,8 @@ package org.apache.sysds.runtime.compress.colgroup.dictionary;
 import java.io.DataOutput;
 import java.io.IOException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.Bitmap;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
@@ -35,7 +37,7 @@ import 
org.apache.sysds.runtime.matrix.operators.ScalarOperator;
  */
 public abstract class ADictionary {
 
-       // private static final Log LOG = 
LogFactory.getLog(ADictionary.class.getName());
+       protected static final Log LOG = 
LogFactory.getLog(ADictionary.class.getName());
 
        /**
         * Get all the values contained in the dictionary as a linearized 
double array.
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index b4f843f..5a32823 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -24,8 +24,6 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
@@ -39,7 +37,6 @@ import org.apache.sysds.utils.MemoryEstimates;
  */
 public class Dictionary extends ADictionary {
 
-       protected static final Log LOG = 
LogFactory.getLog(Dictionary.class.getName());
        private final double[] _values;
 
        public Dictionary(double[] values) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
index a8559db..7b01fb5 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
@@ -24,8 +24,6 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Divide;
@@ -42,7 +40,6 @@ import org.apache.sysds.utils.MemoryEstimates;
  */
 public class QDictionary extends ADictionary {
 
-       protected static final Log LOG = 
LogFactory.getLog(QDictionary.class.getName());
        protected double _scale;
        protected byte[] _values;
 
@@ -121,14 +118,13 @@ public class QDictionary extends ADictionary {
                return ret;
        }
 
-
        @Override
-       public double[] aggregateTuples(Builtin fn, final int nCol){
+       public double[] aggregateTuples(Builtin fn, final int nCol) {
                if(nCol == 1)
                        return getValues();
                final int nRows = _values.length / nCol;
                double[] res = new double[nRows];
-               for(int i = 0; i < nRows; i++){
+               for(int i = 0; i < nRows; i++) {
                        final int off = i * nCol;
                        res[i] = _values[off];
                        for(int j = off + 1; j < off + nCol; j++)
@@ -258,7 +254,6 @@ public class QDictionary extends ADictionary {
                return new QDictionary(ret, _scale);
        }
 
-
        @Override
        public void write(DataOutput out) throws IOException {
                super.write(out);
@@ -314,20 +309,20 @@ public class QDictionary extends ADictionary {
                }
        }
 
-
        @Override
-       public double[] colSum(int[] counts, int nCol){
+       public double[] colSum(int[] counts, int nCol) {
                throw new NotImplementedException("Not Implemented");
                // final double[] res = new double[counts.length];
                // int idx = 0;
                // for(int k = 0; k< _values.length / counts.length; k++){
-               //      final int cntk = counts[k];
-               //      for(int j = 0; j< counts.length; j++){
-               //              res[j] += _values[idx++] * cntk;
-               //      }
+               // final int cntk = counts[k];
+               // for(int j = 0; j< counts.length; j++){
+               // res[j] += _values[idx++] * cntk;
+               // }
                // }
                // return res;
        }
+
        @Override
        public void colSum(double[] c, int[] counts, int[] colIndexes, boolean 
square) {
                throw new NotImplementedException("Not Implemented");
@@ -412,7 +407,7 @@ public class QDictionary extends ADictionary {
                }
        }
 
-       public String getString( int colIndexes) {
+       public String getString(int colIndexes) {
                StringBuilder sb = new StringBuilder();
                for(int i = 0; i < size(); i++) {
                        sb.append(_values[i]);
@@ -453,32 +448,31 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public boolean containsValue(double pattern){
+       public boolean containsValue(double pattern) {
                if(Double.isNaN(pattern) || Double.isInfinite(pattern))
                        return false;
                throw new NotImplementedException("Not contains value on Q 
Dictionary");
        }
 
        @Override
-       public long getNumberNonZeros(int[] counts, int nCol){
-               long nnz =  0;
+       public long getNumberNonZeros(int[] counts, int nCol) {
+               long nnz = 0;
                final int nRow = _values.length / nCol;
-               for(int i = 0; i < nRow; i++){
+               for(int i = 0; i < nRow; i++) {
                        long rowCount = 0;
-                       final int off = i * nCol; 
-                       for(int j = off; j < off + nCol; j++){
+                       final int off = i * nCol;
+                       for(int j = off; j < off + nCol; j++) {
                                if(_values[j] != 0)
-                                       rowCount ++;
+                                       rowCount++;
                        }
                        nnz += rowCount * counts[i];
                }
                return nnz;
        }
 
-
        @Override
-       public void addToEntry(Dictionary d, int fr, int to, int nCol){
-               throw new NotImplementedException("Not implemented yet");       
+       public void addToEntry(Dictionary d, int fr, int to, int nCol) {
+               throw new NotImplementedException("Not implemented yet");
        }
 
        @Override
@@ -487,9 +481,9 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public long getNumberNonZerosContained(){
+       public long getNumberNonZerosContained() {
                long count = 0;
-               for(double v : _values){
+               for(double v : _values) {
                        if(v != 0.0)
                                count++;
                }
@@ -497,12 +491,12 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public double[] getMostCommonTuple(int[] counts, int nCol){
+       public double[] getMostCommonTuple(int[] counts, int nCol) {
                return null;
        }
 
        @Override
-       public ADictionary subtractTuple(double[] tuple){
+       public ADictionary subtractTuple(double[] tuple) {
                throw new NotImplementedException();
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java
new file mode 100644
index 0000000..3a400f3
--- /dev/null
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/SparseDictionary.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup.dictionary;
+
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.ValueFunction;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * A sparse dictionary implementation, use if the tuples are sparse.
+ */
+public class SparseDictionary extends ADictionary {
+
+       @Override
+       public double[] getValues() {
+               LOG.warn("Inefficient materialization of sparse Dictionary.");
+
+               return null;
+       }
+
+       @Override
+       public double getValue(int i) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public int hasZeroTuple(int nCol) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public long getInMemorySize() {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public double aggregate(double init, Builtin fn) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public double[] aggregateTuples(Builtin fn, int nCol) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public int size() {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public ADictionary apply(ScalarOperator op) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary applyScalarOp(ScalarOperator op, double newVal, int 
numCols) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary applyBinaryRowOpLeft(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary applyBinaryRowOpRight(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary clone() {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary cloneAndExtend(int len) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public boolean isLossy() {
+               // TODO Auto-generated method stub
+               return false;
+       }
+
+       @Override
+       public int getNumberOfValues(int ncol) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public double[] sumAllRowsToDouble(boolean square, int nrColumns) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public double sumRow(int k, boolean square, int nrColumns) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public double[] colSum(int[] counts, int nCol) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public void colSum(double[] c, int[] counts, int[] colIndexes, boolean 
square) {
+               // TODO Auto-generated method stub
+
+       }
+
+       @Override
+       public double sum(int[] counts, int ncol) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public double sumsq(int[] counts, int ncol) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public String getString(int colIndexes) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public void addMaxAndMin(double[] ret, int[] colIndexes) {
+               // TODO Auto-generated method stub
+
+       }
+
+       @Override
+       public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int 
previousNumberOfColumns) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary reExpandColumns(int max) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public boolean containsValue(double pattern) {
+               // TODO Auto-generated method stub
+               return false;
+       }
+
+       @Override
+       public long getNumberNonZeros(int[] counts, int nCol) {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public long getNumberNonZerosContained() {
+               // TODO Auto-generated method stub
+               return 0;
+       }
+
+       @Override
+       public void addToEntry(Dictionary d, int fr, int to, int nCol) {
+               // TODO Auto-generated method stub
+
+       }
+
+       @Override
+       public double[] getMostCommonTuple(int[] counts, int nCol) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+       @Override
+       public ADictionary subtractTuple(double[] tuple) {
+               // TODO Auto-generated method stub
+               return null;
+       }
+
+}
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
index 5332060..4902cb8 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
@@ -42,19 +42,19 @@ public abstract class CompressedSizeEstimator {
        protected static final Log LOG = 
LogFactory.getLog(CompressedSizeEstimator.class.getName());
 
        /** The Matrix Block to extract the compression estimates from */
-       protected MatrixBlock _data;
+       final protected MatrixBlock _data;
        /** The number of rows in the matrix block, extracted to a field 
because the matrix could be transposed */
-       protected int _numRows;
+       final protected int _numRows;
        /** The number of columns in the matrix block, extracted to a field 
because the matrix could be transposed */
-       protected int _numCols;
+       final protected int _numCols;
        /** The compression settings to use, for estimating the size, and 
compress the ColGroups. */
-       protected final CompressionSettings _compSettings;
+       final protected CompressionSettings _compSettings;
 
        /**
         * Boolean specifying if the _data is in transposed format. This is 
used to select the correct readers for the
         * extraction of bitmaps for the columns.
         */
-       protected boolean _transposed = false;
+       protected boolean _transposed;
 
        /**
         * Main Constructor for Compression Estimator.
@@ -72,6 +72,15 @@ public abstract class CompressedSizeEstimator {
                _compSettings = compSettings;
        }
 
+
+       public int getNumRows(){
+               return _numRows;
+       }
+
+       public int getNumColumns(){
+               return _numCols;
+       }
+
        /**
         * Multi threaded version of extracting Compression Size info
         * 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
index 0c83202..af94473 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -41,6 +41,8 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
        private static final int FORCE_TRANSPOSE_ON_SAMPLE_THRESHOLD = 8000;
 
        private final int[] _sampleRows;
+
+       private final MatrixBlock _sample;
        private HashMap<Integer, Double> _solveCache = null;
 
        /**
@@ -56,7 +58,7 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
                super(data, compSettings, transposed);
                _sampleRows = sampleRows;
                _solveCache = new HashMap<>();
-               _data = sampleData(data, compSettings, sampleRows, transposed);
+               _sample = sampleData(data, compSettings, sampleRows, 
transposed);
        }
 
        protected MatrixBlock sampleData(MatrixBlock data, CompressionSettings 
compSettings, int[] sampleRows,
@@ -75,8 +77,6 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
                        sampledMatrixBlock = 
LibMatrixReorg.transposeInPlace(sampledMatrixBlock, 16);
                }
                else {
-
-                       // Override the _data Matrix block with the sampled 
matrix block.
                        MatrixBlock select = (transposed) ? new 
MatrixBlock(data.getNumColumns(), 1,
                                true) : new MatrixBlock(data.getNumRows(), 1, 
true);
                        for(int i = 0; i < sampleRows.length; i++)
@@ -106,7 +106,7 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
                // final int numCols = colIndexes.length;
 
                // extract statistics from sample
-               final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, 
_data, _transposed);
+               final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, 
_sample, _transposed);
                final EstimationFactors fact = 
EstimationFactors.computeSizeEstimationFactors(ubm, false, _numRows, 
colIndexes);
                final int numZerosInSample = ubm.getZeroCounts();
                final boolean lossy = ubm.getType() == BitmapType.Lossy;
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
index 7b9bb8d..d25f280 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
@@ -144,6 +144,14 @@ public class CompressedSizeInfoColGroup {
                return _cardinalityRatio;
        }
 
+       public double getMostCommonFraction(){
+               return (double) _facts.largestOff / _facts.numRows;
+       }
+
+       public double getTupleSparsity(){
+               return _facts.tupleSparsity;
+       }
+
        private static Map<CompressionType, Long> 
calculateCompressionSizes(EstimationFactors fact,
                Set<CompressionType> validCompressionTypes) {
                Map<CompressionType, Long> res = new HashMap<>();

Reply via email to