This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new c59ad18  [SYSTEMDS-3002] CLA CoCoding without Dictionaries
c59ad18 is described below

commit c59ad18c58c1a0212cfa859bb873958a96cbc624
Author: baunsgaard <[email protected]>
AuthorDate: Thu Jun 3 15:45:00 2021 +0200

    [SYSTEMDS-3002] CLA CoCoding without Dictionaries
    
    This commit makes the co-coding cheaper when many column combinations are
    constructed.
    
    For example, with BinaryMNIST and bin-packing co-coding, the execution
    time of the co-coding phase is halved from 6 to 3 sec.
    This gives a total compression time of 6 sec plus ~10 sec of IO. Also
    notable is normal MNIST of 60k values, where the compression time is
    now ~200ms.
    
    This change impacts the matrix-cost-based column co-coding the most,
    reducing the overall compression time on BinaryMNIST from ~120 sec
    to ~8 sec as well.
    
    Closes #1296
---
 .../runtime/compress/CompressedMatrixBlock.java    |  13 +-
 .../compress/CompressedMatrixBlockFactory.java     |   4 +-
 .../runtime/compress/CompressionSettings.java      |  16 ++-
 .../compress/CompressionSettingsBuilder.java       |   9 +-
 .../runtime/compress/cocode/AColumnCoCoder.java    |   9 +-
 .../sysds/runtime/compress/cocode/CoCodeCost.java  |  80 +++++++-----
 .../compress/cocode/CoCodeCostMatrixMult.java      |  21 ++--
 .../runtime/compress/cocode/PlanningCoCoder.java   |   8 +-
 .../runtime/compress/colgroup/ColGroupFactory.java |   8 +-
 .../compress/colgroup/mapping/AMapToData.java      |  14 +++
 .../compress/colgroup/mapping/MapToBit.java        |  12 +-
 .../compress/colgroup/mapping/MapToByte.java       |  12 +-
 .../compress/colgroup/mapping/MapToChar.java       |  10 +-
 .../compress/colgroup/mapping/MapToFactory.java    |  72 ++++++++++-
 .../compress/colgroup/mapping/MapToInt.java        |  12 +-
 .../compress/estim/CompressedSizeEstimator.java    |  46 ++++++-
 .../estim/CompressedSizeEstimatorExact.java        |  24 ++--
 .../estim/CompressedSizeEstimatorSample.java       | 129 +++++++++++--------
 .../compress/estim/CompressedSizeInfoColGroup.java |  58 +++++++--
 .../runtime/compress/estim/EstimationFactors.java  |  70 +++++++++--
 .../compress/estim/sample/FrequencyCount.java      |  47 -------
 .../estim/sample/GuaranteedErrorEstimator.java     |  66 ----------
 .../compress/estim/sample/HassAndStokes.java       |  13 +-
 .../estim/sample/SampleEstimatorFactory.java       |  96 +++++++++++++++
 .../compress/estim/sample/ShlosserEstimator.java   |  10 +-
 .../estim/sample/ShlosserJackknifeEstimator.java   |  16 +--
 .../estim/sample/SmoothedJackknifeEstimator.java   |   9 +-
 .../sysds/runtime/compress/utils/ABitmap.java      |  12 ++
 .../sysds/runtime/compress/utils/Bitmap.java       |   5 +
 .../sysds/runtime/compress/utils/BitmapLossy.java  |   5 +
 .../runtime/compress/utils/MultiColBitmap.java     |   5 +
 .../runtime/compress/{cocode => utils}/Util.java   |   2 +-
 .../component/compress/CompressedTestBase.java     |   2 +-
 .../compress/estim/JoinCompressionInfoTest.java    | 119 ++++++++++++++++++
 .../compress/mapping/StandAloneTests.java          | 137 +++++++++++++++++++++
 35 files changed, 861 insertions(+), 310 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 6d1d02b..5a52b87 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -78,6 +78,7 @@ import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
 import org.apache.sysds.runtime.instructions.cp.ScalarObject;
 import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
 import org.apache.sysds.runtime.matrix.data.CTableMap;
+import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDatagen;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -492,14 +493,14 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        return out;
 
                BinaryOperator bop = new 
BinaryOperator(Multiply.getMultiplyFnObject());
-
-               MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v, 
null, k, true);
+               boolean allowOverlap = 
ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.COMPRESSED_OVERLAPPING);
+               MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v, 
null, k, allowOverlap);
 
                if(ctype == ChainType.XtwXv) {
-                       // if(tmp instanceof CompressedMatrixBlock)
-                       tmp = CLALibBinaryCellOp.binaryOperations(bop, 
(CompressedMatrixBlock) tmp, w, null);
-                       // else
-                       // LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
+                       if(tmp instanceof CompressedMatrixBlock)
+                               tmp = CLALibBinaryCellOp.binaryOperations(bop, 
(CompressedMatrixBlock) tmp, w, null);
+                       else
+                               LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
                }
 
                if(tmp instanceof CompressedMatrixBlock)
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 914a707..513103a 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -170,7 +170,9 @@ public class CompressedMatrixBlockFactory {
        }
 
        private void coCodePhase(CompressedSizeEstimator sizeEstimator, 
CompressedSizeInfo sizeInfos, int numRows) {
-               coCodeColGroups = 
PlanningCoCoder.findCoCodesByPartitioning(sizeEstimator, sizeInfos, numRows, k, 
compSettings);
+               // for(int i = 0; i < 100000; i ++)
+                       coCodeColGroups = 
PlanningCoCoder.findCoCodesByPartitioning(sizeEstimator, sizeInfos, numRows, k, 
compSettings);
+               
                _stats.estimatedSizeCoCoded = coCodeColGroups.memoryEstimate();
                logPhase();
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index aeabc92..b93a8f5 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -25,6 +25,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import 
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import 
org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;
 
 /**
  * Compression Settings class, used as a bundle of parameters inside the 
Compression framework. See
@@ -99,10 +100,13 @@ public class CompressionSettings {
         */
        public final int minimumSampleSize;
 
+       /** The sample type used for sampling */
+       public final EstimationType estimationType;
+
        protected CompressionSettings(double samplingRatio, boolean 
allowSharedDictionary, String transposeInput,
-               boolean skipList, int seed, boolean lossy,
-               EnumSet<CompressionType> validCompressions, boolean 
sortValuesByLength, PartitionerType columnPartitioner,
-               int maxColGroupCoCode, double coCodePercentage, int 
minimumSampleSize) {
+               boolean skipList, int seed, boolean lossy, 
EnumSet<CompressionType> validCompressions,
+               boolean sortValuesByLength, PartitionerType columnPartitioner, 
int maxColGroupCoCode, double coCodePercentage,
+               int minimumSampleSize, EstimationType estimationType) {
                this.samplingRatio = samplingRatio;
                this.allowSharedDictionary = allowSharedDictionary;
                this.transposeInput = transposeInput;
@@ -115,7 +119,9 @@ public class CompressionSettings {
                this.maxColGroupCoCode = maxColGroupCoCode;
                this.coCodePercentage = coCodePercentage;
                this.minimumSampleSize = minimumSampleSize;
-               LOG.debug(this);
+               this.estimationType = estimationType;
+               if(LOG.isDebugEnabled())
+                       LOG.debug(this);
        }
 
        @Override
@@ -131,6 +137,8 @@ public class CompressionSettings {
                sb.append("\n Max Static ColGroup CoCode: " + 
maxColGroupCoCode);
                sb.append("\n Max cocodePercentage: " + coCodePercentage);
                sb.append("\n Sample Percentage: " + samplingRatio);
+               if(samplingRatio < 1.0)
+                       sb.append("\n Estimation Type: " + estimationType);
                // If needed for debugging add more fields to the printing.
                return sb.toString();
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index 3ec42a0..e4ad2bb 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -26,6 +26,7 @@ import org.apache.sysds.conf.DMLConfig;
 import org.apache.sysds.runtime.DMLCompressionException;
 import 
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import 
org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;
 
 /**
  * Builder pattern for Compression Settings. See CompressionSettings for 
details on values.
@@ -43,6 +44,7 @@ public class CompressionSettingsBuilder {
        private int maxColGroupCoCode = 10000;
        private double coCodePercentage = 0.01;
        private int minimumSampleSize = 2000;
+       private EstimationType estimationType = EstimationType.HassAndStokes;
 
        private final static double defaultSampleRate = 0.01;
 
@@ -261,6 +263,11 @@ public class CompressionSettingsBuilder {
                return this;
        }
 
+       public CompressionSettingsBuilder setEstimationType(EstimationType 
estimationType){
+               this.estimationType = estimationType;
+               return this;
+       }
+
        /**
         * Create the CompressionSettings object to use in the compression.
         * 
@@ -269,6 +276,6 @@ public class CompressionSettingsBuilder {
        public CompressionSettings create() {
                return new CompressionSettings(samplingRatio, 
allowSharedDictionary, transposeInput, skipList, seed, lossy,
                        validCompressions, sortValuesByLength, 
columnPartitioner, maxColGroupCoCode, coCodePercentage,
-                       minimumSampleSize);
+                       minimumSampleSize, estimationType);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
index 124e3a4..202e4bf 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
@@ -25,6 +25,7 @@ import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.utils.Util;
 
 public abstract class AColumnCoCoder {
 
@@ -54,15 +55,17 @@ public abstract class AColumnCoCoder {
 
        protected CompressedSizeInfoColGroup 
joinWithAnalysis(CompressedSizeInfoColGroup lhs,
                CompressedSizeInfoColGroup rhs) {
-               int[] joined = Util.join(lhs.getColumns(), rhs.getColumns());
-               return _est.estimateCompressedColGroupSize(joined,( 
lhs.getNumVals() + 1) * (rhs.getNumVals() + 1));
+               return _est.estimateJoinCompressedSize(lhs, rhs);
        }
 
        protected CompressedSizeInfoColGroup 
joinWithoutAnalysis(CompressedSizeInfoColGroup lhs,
                CompressedSizeInfoColGroup rhs) {
                int[] joined = Util.join(lhs.getColumns(), rhs.getColumns());
                int numVals = lhs.getNumVals() + rhs.getNumVals();
-               return new CompressedSizeInfoColGroup(joined, numVals, 
_est.getNumRows());
+               if(numVals< 0 || numVals > _est.getNumRows())
+                       return null;
+               else
+                       return new CompressedSizeInfoColGroup(joined, numVals, 
_est.getNumRows());
        }
 
        protected CompressedSizeInfoColGroup analyze(CompressedSizeInfoColGroup 
g) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
index 3f7185a..a39a524 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
@@ -61,47 +61,67 @@ public class CoCodeCost extends AColumnCoCoder {
        }
 
        private List<CompressedSizeInfoColGroup> 
join(List<CompressedSizeInfoColGroup> currentGroups) {
-
+               // return joinToSmallForAnalysis(currentGroups);
+               List<CompressedSizeInfoColGroup> filteredGroups = 
joinToSmallForAnalysis(currentGroups);
+               // return currentGroups;
                Comparator<CompressedSizeInfoColGroup> comp = 
Comparator.comparing(CompressedSizeInfoColGroup::getNumVals);
-               Queue<CompressedSizeInfoColGroup> que = new 
PriorityQueue<>(currentGroups.size(), comp);
+               Queue<CompressedSizeInfoColGroup> que = new 
PriorityQueue<>(filteredGroups.size(), comp);
                List<CompressedSizeInfoColGroup> ret = new ArrayList<>();
 
-               for(CompressedSizeInfoColGroup g : currentGroups)
-                       que.add(g);
-
-               boolean finished = false;
-               while(!finished) {
-                       if(que.peek() != null) {
-                               CompressedSizeInfoColGroup l = que.poll();
-                               if(que.peek() != null) {
-                                       CompressedSizeInfoColGroup r = 
que.poll();
-                                       int worstCaseJoinedSize = 
l.getNumVals() * r.getNumVals();
-                                       if(worstCaseJoinedSize < 
toSmallForAnalysis)
-                                               que.add(joinWithoutAnalysis(l, 
r));
-                                       else if(worstCaseJoinedSize < 
largestDistinct) {
-                                               CompressedSizeInfoColGroup g = 
joinWithAnalysis(l, r);
-                                               if(g.getNumVals() < 
largestDistinct)
-                                                       
que.add(joinWithAnalysis(l, r));
-                                               else {
-                                                       ret.add(l);
-                                                       que.add(r);
-                                               }
-                                       }
-                                       else {
-                                               ret.add(l);
-                                               que.add(r);
-                                       }
+               for(CompressedSizeInfoColGroup g : filteredGroups) {
+                       if(g != null)
+                               que.add(g);
+               }
+
+               CompressedSizeInfoColGroup l = que.poll();
+
+               while(que.peek() != null) {
+                       CompressedSizeInfoColGroup r = que.peek();
+                       int worstCaseJoinedSize = l.getNumVals() * 
r.getNumVals();
+                       if(worstCaseJoinedSize < toSmallForAnalysis) {
+                               que.poll();
+                               que.add(joinWithoutAnalysis(l, r));
+                       }
+                       else if(worstCaseJoinedSize < largestDistinct) {
+                               CompressedSizeInfoColGroup g = 
joinWithAnalysis(l, r);
+                               if(g != null && g.getNumVals() < 
largestDistinct) {
+                                       que.poll();
+                                       que.add(g);
                                }
-                               else
+                               else 
                                        ret.add(l);
                        }
-                       else
-                               finished = true;
+                       else 
+                               ret.add(l);
+                       
+                       l = que.poll();
                }
 
+               ret.add(l);
+
                for(CompressedSizeInfoColGroup g : que)
                        ret.add(g);
 
                return ret;
        }
+
+       private List<CompressedSizeInfoColGroup> 
joinToSmallForAnalysis(List<CompressedSizeInfoColGroup> currentGroups) {
+               return currentGroups;
+               // List<CompressedSizeInfoColGroup> tmp = new ArrayList<>();
+               // int id = 0;
+               // while(id < currentGroups.size() - 1) {
+               //      CompressedSizeInfoColGroup g1 = currentGroups.get(id);
+               //      CompressedSizeInfoColGroup g2 = currentGroups.get(id + 
1);
+               //      if(g1.getNumVals() * g2.getNumVals() < 
toSmallForAnalysis) {
+               //              tmp.add(joinWithoutAnalysis(g1, g2));
+               //      }
+               //      else {
+               //              tmp.add(g1);
+               //              tmp.add(g2);
+               //      }
+               //      id += 2;
+
+               // }
+               // return tmp;
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
index 53f3243..2dd45d1 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
@@ -103,17 +103,22 @@ public class CoCodeCostMatrixMult extends AColumnCoCoder {
 
                protected CostOfJoin(CompressedSizeInfoColGroup elm) {
                        this.elm = elm;
+                       if(elm == null) {
+                               this.cost = Double.POSITIVE_INFINITY;
+                       }
+                       else {
 
-                       final int nCols = elm.getColumns().length;
-                       final double nRows = _est.getNumRows();
-                       final double preAggregateCost = nRows;
+                               final int nCols = elm.getColumns().length;
+                               final double nRows = _est.getNumRows();
+                               final double preAggregateCost = nRows;
 
-                       final int numberTuples = elm.getNumVals();
-                       final double tupleSparsity = elm.getTupleSparsity();
-                       final double postScalingCost = (nCols > 1 && 
tupleSparsity > 0.4) ? numberTuples * nCols : numberTuples *
-                               nCols * tupleSparsity;
+                               final int numberTuples = elm.getNumVals();
+                               final double tupleSparsity = 
elm.getTupleSparsity();
+                               final double postScalingCost = (nCols > 1 && 
tupleSparsity > 0.4) ? numberTuples *
+                                       nCols : numberTuples * nCols * 
tupleSparsity;
 
-                       this.cost = preAggregateCost + postScalingCost;
+                               this.cost = preAggregateCost + postScalingCost;
+                       }
                }
 
                @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
index 6074d1d..28912ba 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
@@ -32,6 +32,7 @@ import 
org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.utils.Util;
 
 public class PlanningCoCoder {
 
@@ -132,7 +133,10 @@ public class PlanningCoCoder {
                List<CompressedSizeInfoColGroup> finalGroups = new 
ArrayList<>();
                // For each bin of columns that is allowed to potentially 
cocode.
                for(CompressedSizeInfoColGroup bin : bins.getInfo()) {
-                       if(bin.getColumns().length == 1)
+                       final int len = bin.getColumns().length;
+                       if(len == 0)
+                               continue;
+                       else if(len == 1)
                                // early termination
                                finalGroups.add(bin);
                        else
@@ -248,7 +252,7 @@ public class PlanningCoCoder {
                                        g = 
CompressedSizeInfoColGroup.addConstGroup(c, left, cs.validCompressions);
                                else {
                                        st3++;
-                                       g = 
est.estimateCompressedColGroupSize(c);
+                                       g = 
est.estimateJoinCompressedSize(left, right);
                                }
 
                                if(leftConst || rightConst)
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 1cb92c7..acd59ad 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -200,7 +200,7 @@ public final class ColGroupFactory {
                CompressedSizeEstimator estimator = new 
CompressedSizeEstimatorExact(in, compSettings);
 
                CompressedSizeInfoColGroup sizeInfo = new 
CompressedSizeInfoColGroup(
-                       estimator.estimateCompressedColGroupSize(ubm, 
colIndexes), compSettings.validCompressions);
+                       estimator.estimateCompressedColGroupSize(ubm, 
colIndexes), compSettings.validCompressions, ubm);
 
                int numRows = compSettings.transposed ? in.getNumColumns() : 
in.getNumRows();
                return compress(colIndexes, numRows, ubm, 
sizeInfo.getBestCompressionType(compSettings), compSettings, in,
@@ -285,8 +285,8 @@ public final class ColGroupFactory {
        public static AColGroup compress(int[] colIndexes, int rlen, ABitmap 
ubm, CompressionType compType,
                CompressionSettings cs, MatrixBlock rawMatrixBlock, double 
tupleSparsity) {
 
-               if(compType == CompressionType.UNCOMPRESSED && 
PartitionerType.isCostBased(cs.columnPartitioner))
-                       compType = CompressionType.DDC;
+               // if(compType == CompressionType.UNCOMPRESSED && 
PartitionerType.isCostBased(cs.columnPartitioner))
+               //      compType = CompressionType.DDC;
 
                final IntArrayList[] of = ubm.getOffsetList();
 
@@ -411,7 +411,7 @@ public final class ColGroupFactory {
 
        private static AColGroup compressDDC(int[] colIndexes, int rlen, 
ABitmap ubm, CompressionSettings cs,
                double tupleSparsity) {
-               boolean zeros = ubm.getNumOffsets() < (long) rlen;
+               boolean zeros = ubm.getNumOffsets() < rlen;
                ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity, 
zeros);
                AMapToData data = MapToFactory.create(rlen, zeros, 
ubm.getOffsetList());
                return new ColGroupDDC(colIndexes, rlen, dict, data, null);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
index ed1f3a0..42dd6b4 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
@@ -29,6 +29,20 @@ public abstract class AMapToData {
 
        protected static final Log LOG = 
LogFactory.getLog(AMapToData.class.getName());
 
+       private int nUnique;
+
+       protected AMapToData(int nUnique) {
+               this.nUnique = nUnique;
+       }
+
+       public int getUnique() {
+               return nUnique;
+       }
+
+       protected void setUnique(int nUnique){
+               this.nUnique = nUnique;
+       }
+
        public abstract int getIndex(int n);
 
        public abstract void set(int n, int v);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
index d77643b..48abe9e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
@@ -32,12 +32,14 @@ public class MapToBit extends AMapToData {
        private final BitSet _data;
        private final int _size;
 
-       public MapToBit(int size) {
+       public MapToBit(int unique, int size) {
+               super(unique);
                _data = new BitSet(size);
                _size = size;
        }
 
-       private MapToBit(BitSet d, int size) {
+       private MapToBit(int unique, BitSet d, int size) {
+               super(unique);
                _data = d;
                _size = size;
        }
@@ -66,7 +68,7 @@ public class MapToBit extends AMapToData {
        @Override
        public long getExactSizeOnDisk() {
                final int dSize = _data.size();
-               long size = 1 + 4 + 4; // base variables
+               long size = 1 + 4 +  4 + 4; // base variables
                size += (dSize / 64) * 8; // all longs except last
                size += (dSize % 64 == 0 ? 0 : 8); // last long
                return size;
@@ -86,6 +88,7 @@ public class MapToBit extends AMapToData {
        public void write(DataOutput out) throws IOException {
                long[] internals = _data.toLongArray();
                out.writeByte(MAP_TYPE.BIT.ordinal());
+               out.writeInt(getUnique());
                out.writeInt(_size);
                out.writeInt(internals.length);
                for(int i = 0; i < internals.length; i++)
@@ -93,11 +96,12 @@ public class MapToBit extends AMapToData {
        }
 
        public static MapToBit readFields(DataInput in) throws IOException {
+               int unique = in.readInt();
                int size = in.readInt();
                long[] internalLong = new long[in.readInt()];
                for(int i = 0; i < internalLong.length; i++)
                        internalLong[i] = in.readLong();
 
-               return new MapToBit(BitSet.valueOf(internalLong), size);
+               return new MapToBit(unique, BitSet.valueOf(internalLong), size);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
index b6315ad..b0830a3 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
@@ -31,11 +31,13 @@ public class MapToByte extends AMapToData {
 
        private final byte[] _data;
 
-       public MapToByte(int size) {
+       public MapToByte(int unique, int size) {
+               super(unique);
                _data = new byte[size];
        }
 
-       private MapToByte(byte[] data) {
+       private MapToByte(int unique, byte[] data) {
+               super(unique);
                _data = data;
        }
 
@@ -62,7 +64,7 @@ public class MapToByte extends AMapToData {
 
        @Override
        public long getExactSizeOnDisk() {
-               return 1 + 4 + _data.length;
+               return 1 + 4+ 4 + _data.length;
        }
 
        @Override
@@ -78,16 +80,18 @@ public class MapToByte extends AMapToData {
        @Override
        public void write(DataOutput out) throws IOException {
                out.writeByte(MAP_TYPE.BYTE.ordinal());
+               out.writeInt(getUnique());
                out.writeInt(_data.length);
                for(int i = 0; i < _data.length; i++)
                        out.writeByte(_data[i]);
        }
 
        public static MapToByte readFields(DataInput in) throws IOException {
+               int unique = in.readInt();
                final int length = in.readInt();
                final byte[] data = new byte[length];
                for(int i = 0; i < length; i++)
                        data[i] = in.readByte();
-               return new MapToByte(data);
+               return new MapToByte(unique, data);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
index ac6f388..45b9d10 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
@@ -31,11 +31,13 @@ public class MapToChar extends AMapToData {
 
        private final char[] _data;
 
-       public MapToChar(int size) {
+       public MapToChar(int unique, int size) {
+               super(unique);
                _data = new char[size];
        }
 
-       public MapToChar(char[] data) {
+       public MapToChar(int unique, char[] data) {
+               super(unique);
                _data = data;
        }
 
@@ -78,17 +80,19 @@ public class MapToChar extends AMapToData {
        @Override
        public void write(DataOutput out) throws IOException {
                out.writeByte(MAP_TYPE.CHAR.ordinal());
+               out.writeInt(getUnique());
                out.writeInt(_data.length);
                for(int i = 0; i < _data.length; i++)
                        out.writeChar(_data[i]);
        }
 
        public static MapToChar readFields(DataInput in) throws IOException {
+               int unique = in.readInt();
                final int length = in.readInt();
                final char[] data = new char[length];
                for(int i = 0; i < length; i++)
                        data[i] = in.readChar();
-               return new MapToChar(data);
+               return new MapToChar(unique, data);
        }
 
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
index cbf142b..8858640 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
@@ -22,15 +22,32 @@ package org.apache.sysds.runtime.compress.colgroup.mapping;
 import java.io.DataInput;
 import java.io.IOException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLCompressionException;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.IntArrayList;
 
-public class MapToFactory {
+public final class MapToFactory {
+
+       protected static final Log LOG = 
LogFactory.getLog(MapToFactory.class.getName());
 
        public enum MAP_TYPE {
                BIT, BYTE, CHAR, INT;
        }
 
+       public static AMapToData create(ABitmap map) {
+               return create(map.getNumRows(), map);
+       }
+
+       public static AMapToData create(int size, ABitmap map) {
+               if(map == null || map.isEmpty())
+                       return null;
+
+               boolean zeros = map.getNumOffsets() < size;
+               return create(size, zeros, map.getOffsetList());
+       }
+
        public static AMapToData create(int size, boolean zeros, IntArrayList[] 
values) {
                AMapToData _data = MapToFactory.create(size, values.length + 
(zeros ? 1 : 0));
                if(zeros)
@@ -47,13 +64,13 @@ public class MapToFactory {
 
        public static AMapToData create(int size, int numTuples) {
                if(numTuples <= 1)
-                       return new MapToBit(size);
+                       return new MapToBit(numTuples, size);
                else if(numTuples <= 256)
-                       return new MapToByte(size);
+                       return new MapToByte(numTuples, size);
                else if(numTuples <= Character.MAX_VALUE)
-                       return new MapToChar(size);
+                       return new MapToChar(numTuples, size);
                else
-                       return new MapToInt(size);
+                       return new MapToInt(numTuples, size);
        }
 
        public static long estimateInMemorySize(int size, int numTuples) {
@@ -82,4 +99,49 @@ public class MapToFactory {
                                throw new DMLCompressionException("Unknown Map 
type.");
                }
        }
+
+       public static AMapToData join(AMapToData left, AMapToData right) {
+               if(left == null)
+                       return right;
+               else if(right == null)
+                       return left;
+               final int nVL = left.getUnique();
+               final int nVR = right.getUnique();
+               final int size = left.size();
+               final int maxUnique = nVL * nVR;
+               if(size != right.size())
+                       throw new DMLCompressionException("Invalid input maps 
to join, must contain same number of rows");
+
+               try {
+                       return computeJoin(left, right, size, nVL, maxUnique);
+               }
+               catch(Exception e) {
+                       throw new DMLCompressionException("Joining failed max 
unique expected:" + maxUnique, e);
+               }
+       }
+
+       private static AMapToData computeJoin(AMapToData left, AMapToData 
right, int size, int nVL, int maxUnique) {
+               AMapToData tmp = create(size, maxUnique);
+               return computeJoinUsingLinearizedMap(tmp, left, right, size, 
nVL, maxUnique);
+       }
+
+       private static AMapToData computeJoinUsingLinearizedMap(AMapToData tmp, 
AMapToData left, AMapToData right, int size,
+               int nVL, int maxUnique) {
+               int[] map = new int[maxUnique];
+               int newUID = 1;
+               for(int i = 0; i < size; i++) {
+                       final int nv = left.getIndex(i) + right.getIndex(i) * 
nVL;
+                       final int mapV = map[nv];
+                       if(mapV == 0) {
+                               tmp.set(i, newUID - 1);
+                               map[nv] = newUID++;
+                       }
+                       else 
+                               tmp.set(i, mapV - 1);
+               }
+
+               tmp.setUnique(newUID-1);
+               // LOG.error(Arrays.toString(map));
+               return tmp;
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
index 8318be1..a0e3dcb 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
@@ -31,11 +31,13 @@ public class MapToInt extends AMapToData {
 
        private final int[] _data;
 
-       public MapToInt(int size) {
+       public MapToInt(int unique, int size) {
+               super(unique);
                _data = new int[size];
        }
 
-       private MapToInt(int[] data) {
+       private MapToInt(int unique,int[] data) {
+               super(unique);
                _data = data;
        }
 
@@ -62,7 +64,7 @@ public class MapToInt extends AMapToData {
 
        @Override
        public long getExactSizeOnDisk() {
-               return 1 + 4 + _data.length * 4;
+               return 1 + 4 + 4 + _data.length * 4;
        }
 
        @Override
@@ -78,16 +80,18 @@ public class MapToInt extends AMapToData {
        @Override
        public void write(DataOutput out) throws IOException {
                out.writeByte(MAP_TYPE.INT.ordinal());
+               out.writeInt(getUnique());
                out.writeInt(_data.length);
                for(int i = 0; i < _data.length; i++)
                        out.writeInt(_data[i]);
        }
 
        public static MapToInt readFields(DataInput in) throws IOException {
+               int unique = in.readInt();
                final int length = in.readInt();
                final int[] data = new int[length];
                for(int i = 0; i < length; i++)
                        data[i] = in.readInt();
-               return new MapToInt(data);
+               return new MapToInt(unique, data);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
index 91e4a4e..24c2958 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
@@ -33,6 +33,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
+import org.apache.sysds.runtime.compress.utils.Util;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 
@@ -178,6 +179,32 @@ public abstract class CompressedSizeEstimator {
        public abstract CompressedSizeInfoColGroup 
estimateCompressedColGroupSize(int[] colIndexes, int nrUniqueUpperBound);
 
        /**
+        * Join two analyzed column groups together without materializing the 
dictionaries of either side.
+        * 
+        * If either side was constructed without analysis, then fall back to 
default materialization of double arrays.
+        * 
+        * @param g1 First group
+        * @param g2 Second group
+        * @return A joined compressed size estimation for the group.
+        */
+       public CompressedSizeInfoColGroup 
estimateJoinCompressedSize(CompressedSizeInfoColGroup g1,
+               CompressedSizeInfoColGroup g2) {
+               int[] joined = Util.join(g1.getColumns(), g2.getColumns());
+               final int g1V = g1.getNumVals();
+               final int g2V = g2.getNumVals();
+               if(g1V * g2V < 0 || g1V * g2V > getNumRows())
+                       return null;
+               else if(joined.length == 2 || (g1.getMap() == null && g2V != 0) 
|| (g2.getMap() == null && g2V != 0))
+                       return estimateCompressedColGroupSize(joined, (g1V + 1) 
* (g2V + 1));
+               else
+                       return estimateJoinCompressedSize(joined, g1, g2);
+
+       }
+
+       protected abstract CompressedSizeInfoColGroup 
estimateJoinCompressedSize(int[] joinedcols,
+               CompressedSizeInfoColGroup g1, CompressedSizeInfoColGroup g2);
+
+       /**
         * Method used to extract the CompressedSizeEstimationFactors from an 
constructed UncompressedBitmap. Note this
         * method works both for the sample based estimator and the exact 
estimator, since the bitmap, can be extracted from
         * a sample or from the entire dataset.
@@ -190,9 +217,10 @@ public abstract class CompressedSizeEstimator {
                return estimateCompressedColGroupSize(ubm, colIndexes, 
_numRows, _cs);
        }
 
-       public static EstimationFactors estimateCompressedColGroupSize(ABitmap 
ubm, int[] colIndexes, int nrRows,  CompressionSettings cs) {
+       public static EstimationFactors estimateCompressedColGroupSize(ABitmap 
ubm, int[] colIndexes, int nrRows,
+               CompressionSettings cs) {
                return EstimationFactors.computeSizeEstimationFactors(ubm, 
cs.validCompressions.contains(CompressionType.RLE),
-               nrRows, colIndexes);
+                       colIndexes);
        }
 
        private CompressedSizeInfoColGroup[] CompressedSizeInfoColGroup(int 
clen) {
@@ -247,4 +275,18 @@ public abstract class CompressedSizeEstimator {
                }
                return colIndexes;
        }
+
+       @Override
+       public String toString() {
+               StringBuilder sb = new StringBuilder();
+               sb.append(this.getClass().getSimpleName());
+               sb.append(" transposed: ");
+               sb.append(_transposed);
+               sb.append(" cols: ");
+               sb.append(_numCols);
+               sb.append(" rows: ");
+               sb.append(_numRows);
+               return sb.toString();
+       }
+
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
index 5fdf8e2..20de976 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
@@ -20,6 +20,9 @@
 package org.apache.sysds.runtime.compress.estim;
 
 import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.lib.BitmapEncoder;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -37,20 +40,17 @@ public class CompressedSizeEstimatorExact extends 
CompressedSizeEstimator {
        public CompressedSizeInfoColGroup estimateCompressedColGroupSize(int[] 
colIndexes, int nrUniqueUpperBound) {
                // exact estimator can ignore upper bound.
                ABitmap entireBitMap = BitmapEncoder.extractBitmap(colIndexes, 
_data, _transposed);
-               return new 
CompressedSizeInfoColGroup(estimateCompressedColGroupSize(entireBitMap, 
colIndexes),
-                       _cs.validCompressions);
+               EstimationFactors em = 
estimateCompressedColGroupSize(entireBitMap, colIndexes);
+               return new CompressedSizeInfoColGroup(em, 
_cs.validCompressions, entireBitMap);
        }
 
        @Override
-       public String toString() {
-               StringBuilder sb = new StringBuilder();
-               sb.append(this.getClass().getSimpleName());
-               sb.append(" transposed: ");
-               sb.append(_transposed);
-               sb.append(" cols: ");
-               sb.append(_numCols);
-               sb.append(" rows: ");
-               sb.append(_numRows);
-               return sb.toString();
+       public CompressedSizeInfoColGroup estimateJoinCompressedSize(int[] 
joined, CompressedSizeInfoColGroup g1,
+               CompressedSizeInfoColGroup g2) {
+               AMapToData map = MapToFactory.join(g1.getMap(), g2.getMap());
+               EstimationFactors em = 
EstimationFactors.computeSizeEstimation(joined, map,
+                       _cs.validCompressions.contains(CompressionType.RLE), 
_numRows, false);
+               return new CompressedSizeInfoColGroup(em, 
_cs.validCompressions, map);
        }
+
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
index 3b43796..6c19dc9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -21,13 +21,14 @@ package org.apache.sysds.runtime.compress.estim;
 
 import java.util.HashMap;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
-import org.apache.sysds.runtime.compress.estim.sample.HassAndStokes;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory;
 import org.apache.sysds.runtime.compress.lib.BitmapEncoder;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
-import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
-import org.apache.sysds.runtime.compress.utils.IntArrayList;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.data.SparseBlockMCSR;
 import org.apache.sysds.runtime.data.SparseRow;
@@ -91,63 +92,85 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
 
        @Override
        public CompressedSizeInfoColGroup estimateCompressedColGroupSize(int[] 
colIndexes, int nrUniqueUpperBound) {
-               final int sampleSize = _sampleRows.length;
-               // final int numCols = colIndexes.length;
-               final double scalingFactor = ((double) _numRows / sampleSize);
+
                // extract statistics from sample
                final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, 
_sample, _transposed);
-               final EstimationFactors fact = 
EstimationFactors.computeSizeEstimationFactors(ubm, false, _numRows, 
colIndexes);
-               final int numZerosInSample = ubm.getZeroCounts();
-               final boolean lossy = ubm.getType() == BitmapType.Lossy;
-
-               if(numZerosInSample == sampleSize || ubm.getOffsetList() == 
null) {
-                       // Since we sample, and this column seems to be empty, 
we set the return to 1 value detected.
-                       // aka 1 value, and 1 offset.
-                       // This makes it more robust in the coCoding of Columns
-                       return new CompressedSizeInfoColGroup(new 
EstimationFactors(colIndexes, 1, 1, _numRows - 1, 2, 1, _numRows,
-                               lossy, true, (double) 1 / _numRows, (double) 1 
/ ubm.getNumColumns()), _cs.validCompressions);
+               final EstimationFactors sampleFacts = 
EstimationFactors.computeSizeEstimationFactors(ubm, false, colIndexes);
+               final AMapToData map = MapToFactory.create(ubm);
+
+               // result facts
+               EstimationFactors em = estimateCompressionFactors(sampleFacts, 
map, colIndexes, nrUniqueUpperBound);
+               return new CompressedSizeInfoColGroup(em, 
_cs.validCompressions, map);
+       }
+
+       @Override
+       public CompressedSizeInfoColGroup estimateJoinCompressedSize(int[] 
joined, CompressedSizeInfoColGroup g1,
+               CompressedSizeInfoColGroup g2) {
+               final int g1V = g1.getMap().getUnique();
+               final int g2V = g2.getMap().getUnique();
+               final int nrUniqueUpperBound = g1V * g2V;
+
+               final AMapToData map = MapToFactory.join(g1.getMap(), 
g2.getMap());
+               EstimationFactors sampleFacts = 
EstimationFactors.computeSizeEstimation(joined, map,
+                       _cs.validCompressions.contains(CompressionType.RLE), 
map.size(), false);
+
+               // result facts
+               EstimationFactors em = estimateCompressionFactors(sampleFacts, 
map, joined, nrUniqueUpperBound);
+               return new CompressedSizeInfoColGroup(em, 
_cs.validCompressions, map);
+       }
+
+       private EstimationFactors estimateCompressionFactors(EstimationFactors 
sampleFacts, AMapToData map,
+               int[] colIndexes, int nrUniqueUpperBound) {
+               final int numZerosInSample = sampleFacts.numRows - 
sampleFacts.numOffs;
+               final int sampleSize = _sampleRows.length;
+
+               if(numZerosInSample == sampleSize) {
+                       final int nCol = sampleFacts.cols.length;
+                       /**
+                        * Since we sample, and this column seems to be empty, 
we set the return to 1 value detected, i.e. 1 value
+                        * and 1 offset. This makes it more robust in the 
co-coding of columns.
+                        */
+                       final int largestInstanceCount = _numRows - 1;
+                       return new EstimationFactors(colIndexes, 1, 1, 
largestInstanceCount, new int[] {largestInstanceCount}, 2, 1,
+                               _numRows, sampleFacts.lossy, true, (double) 1 / 
_numRows, (double) 1 / nCol);
                }
+               else {
 
-               // Estimate number of distinct values (incl fixes for anomalies 
w/ large sample fraction)
-               int totalCardinality = getNumDistinctValues(ubm, _numRows, 
sampleSize, _solveCache);
-               // Number of unique is trivially bounded by the sampled number 
of uniques and the number of rows.
-               totalCardinality = Math.min(Math.min(Math.max(totalCardinality, 
fact.numVals), _numRows), nrUniqueUpperBound);
+                       final double scalingFactor = ((double) _numRows / 
sampleSize);
+                       // Estimate number of distinct values (incl fixes for 
anomalies w/ large sample fraction)
+                       final int totalCardinality = Math.max(map.getUnique(), 
Math.min(_numRows,
+                               
getEstimatedDistinctCount(sampleFacts.frequencies, nrUniqueUpperBound)));
 
-               // estimate number of non-zeros (conservatively round up)
-               final double C = Math.max(1 - (double) fact.numSingle / 
sampleSize, (double) sampleSize / _numRows);
-               final int numNonZeros = Math.max((int) Math.floor(_numRows - 
scalingFactor * C * numZerosInSample),
-                       totalCardinality);
+                       // estimate number of non-zeros (conservatively round 
up)
+                       final double C = Math.max(1 - (double) 
sampleFacts.numSingle / sampleSize, (double) sampleSize / _numRows);
+                       final int numNonZeros = Math.max((int) 
Math.floor(_numRows - scalingFactor * C * numZerosInSample),
+                               totalCardinality);
 
+                       final int totalNumRuns = getNumRuns(map, 
sampleFacts.numVals, sampleSize, _numRows, _sampleRows);
+
+                       final int largestInstanceCount = Math.min(_numRows, 
(int) Math.floor(sampleFacts.largestOff * scalingFactor));
+
+                       return new EstimationFactors(colIndexes, 
totalCardinality, numNonZeros, largestInstanceCount,
+                               sampleFacts.frequencies, totalNumRuns, 
sampleFacts.numSingle, _numRows, sampleFacts.lossy,
+                               sampleFacts.zeroIsMostFrequent, 
sampleFacts.overAllSparsity, sampleFacts.tupleSparsity);
+               }
+       }
+
+       private int getEstimatedDistinctCount( int[] frequencies, int 
upperBound) {
+               return Math.min(SampleEstimatorFactory.distinctCount( 
frequencies, _numRows, _sampleRows.length,
+                       _cs.estimationType, _solveCache), upperBound);
+       }
+
+       private int getNumRuns(AMapToData map, int numVals, int sampleSize, int 
totalNumRows, int[] sampleRows) {
                // estimate number of segments and number of runs incl 
correction for
                // empty segments and empty runs (via expected mean of offset 
value)
                // int numUnseenSeg = (int) (unseenVals * Math.ceil((double) 
_numRows / BitmapEncoder.BITMAP_BLOCK_SZ / 2));
-               final int totalNumRuns = 
_cs.validCompressions.contains(CompressionType.RLE) &&
-                       ubm.getNumValues() > 0 ? getNumRuns(ubm, sampleSize, 
_numRows, _sampleRows) : 0;
-
-               // Largest instance count ... initiate as the number of zeros.
-               int largestInstanceCount = numZerosInSample;
-               for(IntArrayList a : ubm.getOffsetList())
-                       if(a.size() > largestInstanceCount)
-                               largestInstanceCount = a.size();
-
-               final boolean zeroIsMostFrequent = largestInstanceCount == 
numZerosInSample;
-
-               // Scale largest Instance count to correctly reflect the number 
of instances.
-               largestInstanceCount = Math.min((int) (scalingFactor * 
largestInstanceCount), _numRows);
-               
-               EstimationFactors totalFacts = new 
EstimationFactors(colIndexes, totalCardinality, numNonZeros,
-                       largestInstanceCount, totalNumRuns, fact.numSingle, 
_numRows, lossy, zeroIsMostFrequent,
-                       fact.overAllSparsity, fact.tupleSparsity);
-
-               // construct new size info summary
-               return new CompressedSizeInfoColGroup(totalFacts, 
_cs.validCompressions);
-       }
-
-       private static int getNumDistinctValues(ABitmap ubm, int numRows, int 
sampleSize,
-               HashMap<Integer, Double> solveCache) {
-               return HassAndStokes.haasAndStokes(ubm, numRows, sampleSize, 
solveCache);
+               return _cs.validCompressions.contains(CompressionType.RLE) && 
numVals > 0 ? getNumRuns(map, sampleSize,
+                       _numRows, _sampleRows) : 0;
        }
 
+       // Fix getNumRuns when adding RLE back.
+       @SuppressWarnings("unused")
        private static int getNumRuns(ABitmap ubm, int sampleSize, int 
totalNumRows, int[] sampleRows) {
                int numVals = ubm.getNumValues();
                double numRuns = 0;
@@ -306,6 +329,11 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
                return (int) Math.min(Math.round(numRuns), Integer.MAX_VALUE);
        }
 
+       private static int getNumRuns(AMapToData map, int sampleSize, int 
totalNumRows, int[] sampleRows) {
+
+               throw new NotImplementedException("Not Supported ever since the 
ubm was replaced by the map");
+       }
+
        /**
         * Returns a sorted array of n integers, drawn uniformly from the range 
[0,range).
         * 
@@ -320,7 +348,7 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
        @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
-               sb.append(this.getClass().getSimpleName());
+               sb.append(super.toString());
                sb.append(" sampleSize: ");
                sb.append(_sampleRows.length);
                sb.append(" transposed: ");
@@ -331,5 +359,4 @@ public class CompressedSizeEstimatorSample extends 
CompressedSizeEstimator {
                sb.append(_numRows);
                return sb.toString();
        }
-
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
index 8b984ff..2fd7a40 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
@@ -28,9 +28,11 @@ import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import 
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupSizes;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 
 /**
  * Information collected about a specific ColGroup's compression size.
@@ -46,6 +48,11 @@ public class CompressedSizeInfoColGroup {
        private final Map<CompressionType, Long> _sizes;
 
        /**
+        * Map containing a mapping to unique values, but not necessarily the 
actual values contained in this column group
+        */
+       private final AMapToData _map;
+
+       /**
         * Join columns without analyzing the content. This only specifies the 
compression ratio if encoded in DDC, since this
         * is trivially calculated. The number of tuples contained can be set 
to the upper theoretical bound of two groups
         * by multiplying the number of distinct tuples of each colGroup with 
each other.
@@ -62,6 +69,7 @@ public class CompressedSizeInfoColGroup {
                _sizes = null;
                _bestCompressionType = null;
                _minSize = 
ColGroupSizes.estimateInMemorySizeDDC(columns.length, numVals, numRows, 1.0, 
false);
+               _map = null;
        }
 
        /**
@@ -69,8 +77,28 @@ public class CompressedSizeInfoColGroup {
         * 
         * @param facts                 The facts extracted from a number of 
columns, based on the estimateFactors
         * @param validCompressionTypes The list of valid compression types, 
allowed to be performed.
+        * @param map                   The map of the distinct values 
contained in this column group.
         */
-       public CompressedSizeInfoColGroup(EstimationFactors facts, 
Set<CompressionType> validCompressionTypes) {
+       public CompressedSizeInfoColGroup(EstimationFactors facts, 
Set<CompressionType> validCompressionTypes,
+               ABitmap map) {
+               _facts = facts;
+               _cardinalityRatio = (double) facts.numVals / facts.numRows;
+               _sizes = calculateCompressionSizes(facts, 
validCompressionTypes);
+               Map.Entry<CompressionType, Long> bestEntry = null;
+               for(Map.Entry<CompressionType, Long> ent : _sizes.entrySet()) {
+                       if(bestEntry == null || ent.getValue() < 
bestEntry.getValue())
+                               bestEntry = ent;
+               }
+
+               _bestCompressionType = bestEntry.getKey();
+               _minSize = bestEntry.getValue();
+               _map = MapToFactory.create(map);
+               if(LOG.isTraceEnabled())
+                       LOG.trace(this);
+       }
+
+       protected CompressedSizeInfoColGroup(EstimationFactors facts, 
Set<CompressionType> validCompressionTypes,
+               AMapToData map) {
                _facts = facts;
                _cardinalityRatio = (double) facts.numVals / facts.numRows;
                _sizes = calculateCompressionSizes(facts, 
validCompressionTypes);
@@ -82,6 +110,7 @@ public class CompressedSizeInfoColGroup {
 
                _bestCompressionType = bestEntry.getKey();
                _minSize = bestEntry.getValue();
+               _map = map;
                if(LOG.isTraceEnabled())
                        LOG.trace(this);
        }
@@ -89,7 +118,7 @@ public class CompressedSizeInfoColGroup {
        /**
         * This method adds a column group without having to analyze. This is 
because the columns added are constant groups.
         * 
-        * NOTE THIS IS ONLY VALID IF THE COLUMN ADDED IS EMPTY!
+        * NOTE THIS IS ONLY VALID IF THE COLUMN ADDED IS EMPTY OR CONSTANT!
         * 
         * @param columns               The columns of the colgroups together
         * @param oneSide               One of the sides, this may contain 
something, but the other side (not part of the
@@ -100,7 +129,7 @@ public class CompressedSizeInfoColGroup {
        public static CompressedSizeInfoColGroup addConstGroup(int[] columns, 
CompressedSizeInfoColGroup oneSide,
                Set<CompressionType> validCompressionTypes) {
                EstimationFactors fact = new EstimationFactors(columns, 
oneSide._facts);
-               CompressedSizeInfoColGroup ret = new 
CompressedSizeInfoColGroup(fact, validCompressionTypes);
+               CompressedSizeInfoColGroup ret = new 
CompressedSizeInfoColGroup(fact, validCompressionTypes, oneSide._map);
                return ret;
        }
 
@@ -109,11 +138,6 @@ public class CompressedSizeInfoColGroup {
        }
 
        public CompressionType getBestCompressionType(CompressionSettings cs) {
-               if(cs.columnPartitioner == PartitionerType.COST_MATRIX_MULT) {
-                       // if(_sizes.get(CompressionType.SDC) * 0.8 < 
_sizes.get(_bestCompressionType))
-                       if(getMostCommonFraction() > 0.4)
-                               return CompressionType.SDC;
-               }
                return _bestCompressionType;
        }
 
@@ -163,6 +187,14 @@ public class CompressedSizeInfoColGroup {
                return _facts.tupleSparsity;
        }
 
+       public AMapToData getMap() {
+               return _map;
+       }
+
+       public boolean containsZeros() {
+               return _facts.numOffs < _facts.numRows;
+       }
+
        private static Map<CompressionType, Long> 
calculateCompressionSizes(EstimationFactors fact,
                Set<CompressionType> validCompressionTypes) {
                Map<CompressionType, Long> res = new HashMap<>();
@@ -218,10 +250,10 @@ public class CompressedSizeInfoColGroup {
 
        @Override
        public boolean equals(Object that) {
-               if(!(that instanceof CompressedSizeInfoColGroup))
-                       return false;
-
-               return Arrays.equals(_facts.cols, ((CompressedSizeInfoColGroup) 
that)._facts.cols);
+               throw new NotImplementedException();
+               // if(!(that instanceof CompressedSizeInfoColGroup))
+               // return false;
+               // return Arrays.equals(_facts.cols, 
((CompressedSizeInfoColGroup) that)._facts.cols);
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java 
b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
index b42e211..57d7182 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
@@ -21,12 +21,13 @@ package org.apache.sysds.runtime.compress.estim;
 
 import java.util.Arrays;
 
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
-import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
 
 /**
  * Compressed Size Estimation factors. Contains meta information used to 
estimate the compression sizes of given columns
@@ -43,6 +44,8 @@ public class EstimationFactors {
        protected final int numOffs;
        /** The number of instances in the largest offset, this is used to 
determine if SDC is good. */
        protected final int largestOff;
+       /** The frequencies of the different tuples in the columns */
+       protected final int[] frequencies;
        /** The Number of runs, of consecutive equal numbers, used primarily in 
RLE */
        protected final int numRuns;
        /** The number of values in the collection that are not zero; also referred to as singletons */
@@ -62,7 +65,7 @@ public class EstimationFactors {
                this.cols = cols;
                this.numVals = numVals;
                this.numRows = numRows;
-
+               this.frequencies = null;
                this.numOffs = -1;
                this.largestOff = -1;
                this.numRuns = -1;
@@ -80,6 +83,7 @@ public class EstimationFactors {
                this.numRows = old.numRows;
                this.numOffs = old.numOffs;
                this.largestOff = old.largestOff;
+               this.frequencies = old.frequencies;
                this.numRuns = old.numRuns;
                this.numSingle = old.numSingle;
                this.lossy = old.lossy;
@@ -89,12 +93,14 @@ public class EstimationFactors {
                this.tupleSparsity = old.tupleSparsity;
        }
 
-       protected EstimationFactors(int[] cols, int numVals, int numOffs, int 
largestOff, int numRuns, int numSingle,
-               int numRows, boolean lossy, boolean zeroIsMostFrequent, double 
overAllSparsity, double tupleSparsity) {
+       protected EstimationFactors(int[] cols, int numVals, int numOffs, int 
largestOff, int[] frequencies, int numRuns,
+               int numSingle, int numRows, boolean lossy, boolean 
zeroIsMostFrequent, double overAllSparsity,
+               double tupleSparsity) {
                this.cols = cols;
                this.numVals = numVals;
                this.numOffs = numOffs;
                this.largestOff = largestOff;
+               this.frequencies = frequencies;
                this.numRuns = numRuns;
                this.numSingle = numSingle;
                this.numRows = numRows;
@@ -117,12 +123,12 @@ public class EstimationFactors {
                                        + " > numRows: " + numRows);
        }
 
-       protected static EstimationFactors computeSizeEstimationFactors(ABitmap 
ubm, boolean inclRLE, int numRows,
-               int[] cols) {
+       protected static EstimationFactors computeSizeEstimationFactors(ABitmap 
ubm, boolean inclRLE, int[] cols) {
+               final int numRows = ubm.getNumRows();
                if(ubm == null || ubm.getOffsetList() == null)
-                       return new EstimationFactors(cols, 0, 0, numRows, 1, 0, 
numRows, false, true, 0, 0);
+                       return new EstimationFactors(cols, 0, 0, numRows, new 
int[] {numRows}, 1, 0, numRows, false, true, 0, 0);
                else {
-                       final int numVals = ubm.getNumValues();
+                       int numVals = ubm.getNumValues();
                        int numRuns = 0;
                        int numOffs = 0;
                        int numSingle = 0;
@@ -153,6 +159,12 @@ public class EstimationFactors {
                        }
 
                        final int zerosOffs = numRows - numOffs;
+                       final boolean containsZero = zerosOffs > 0;
+                       int[] frequencies = new int[numVals + (containsZero ? 1 
: 0)];
+                       for(int i = 0; i < numVals; i++)
+                               frequencies[i] = ubm.getNumOffsets(i);
+                       if(containsZero)
+                               frequencies[numVals] = zerosOffs;
                        final boolean zerosLargestOffset = zerosOffs > 
largestOffs;
                        if(zerosLargestOffset)
                                largestOffs = zerosOffs;
@@ -160,11 +172,49 @@ public class EstimationFactors {
                        double overAllSparsity = (double) overallNonZeroCount / 
(numRows * cols.length);
                        double tupleSparsity = (double) tupleNonZeroCount / 
(numVals * cols.length);
 
-                       return new EstimationFactors(cols, numVals, numOffs, 
largestOffs, numRuns, numSingle, numRows,
-                               ubm.getType() == BitmapType.Lossy, 
zerosLargestOffset, overAllSparsity, tupleSparsity);
+                       return new EstimationFactors(cols, numVals, numOffs, 
largestOffs, frequencies, numRuns, numSingle, numRows,
+                               ubm.lossy(), zerosLargestOffset, 
overAllSparsity, tupleSparsity);
                }
        }
 
+       protected static EstimationFactors computeSizeEstimation(final int[] 
cols, final AMapToData map,
+               final boolean inclRLE, final int numRows, final boolean 
lastEntryAllZero) {
+               final boolean lossy = false;
+               if(map == null)
+                       return new EstimationFactors(cols, 0, 0, numRows, new 
int[] {numRows}, 1, 0, numRows, false, true, 0, 0);
+
+               final int nUnique = map.getUnique();
+               if(lastEntryAllZero) {
+                       throw new NotImplementedException();
+               }
+               else {
+                       final boolean zerosLargestOffset = false;
+                       final double overAllSparsity = 1.0;
+                       final double tupleSparsity = 1.0;
+                       final int numOffs = map.size();
+                       final int numSingle = 0; // unknown
+                       if(inclRLE) {
+                               throw new NotImplementedException();
+                       }
+                       else {
+                               final int numRuns = 0;
+                               int[] counts = new int[nUnique];
+                               for(int i = 0; i < map.size(); i++) {
+                                       counts[map.getIndex(i)]++;
+                               }
+                               int largestOffs = 0;
+                               for(int i = 0; i < nUnique; i++) {
+                                       if(counts[i] > largestOffs)
+                                               largestOffs = counts[i];
+                               }
+                               return new EstimationFactors(cols, nUnique, 
numOffs, largestOffs, counts, numRuns, numSingle, numRows,
+                                       lossy, zerosLargestOffset, 
overAllSparsity, tupleSparsity);
+                       }
+
+               }
+
+       }
+
        @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
deleted file mode 100644
index 3badbe3..0000000
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.estim.sample;
-
-import org.apache.sysds.runtime.compress.utils.ABitmap;
-
-public class FrequencyCount {
-
-       /**
-        * Creates an inverted histogram, where freqCounts[i-1] indicates how 
many values occurred with a frequency i. Note
-        * that freqCounts[0] represents the special values of the number of 
singletons.
-        * 
-        * @param ubm uncompressed bitmap
-        * @return frequency counts
-        */
-       protected static int[] get(ABitmap ubm) {
-               // determine max frequency
-               int numVals = ubm.getNumValues();
-               int maxCount = 0;
-               for(int i = 0; i < numVals; i++)
-                       maxCount = Math.max(maxCount, ubm.getNumOffsets(i));
-
-               // create frequency histogram
-               int[] freqCounts = new int[maxCount];
-               for(int i = 0; i < numVals; i++)
-                       freqCounts[ubm.getNumOffsets(i) - 1]++;
-
-               return freqCounts;
-       }
-}
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
deleted file mode 100644
index e1cea6e..0000000
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.estim.sample;
-
-import java.util.HashMap;
-
-import org.apache.sysds.runtime.compress.readers.ReaderColumnSelection;
-import org.apache.sysds.runtime.compress.utils.DblArray;
-
-public class GuaranteedErrorEstimator {
-
-       /**
-        * M. Charikar, S. Chaudhuri, R. Motwani, and V. R. Narasayya, Towards 
estimation error guarantees for distinct
-        * values, PODS'00.
-        * 
-        * @param nRows            number of rows
-        * @param sampleSize       sample size
-        * @param sampleRowsReader a reader for the sampled rows
-        * @return error estimator
-        */
-       @SuppressWarnings("unused")
-       private static int guaranteedErrorEstimator(int nRows, int sampleSize, 
ReaderColumnSelection sampleRowsReader) {
-               HashMap<DblArray, Integer> valsCount = 
getValCounts(sampleRowsReader);
-               // number of values that occur only once
-               int singltonValsCount = 0;
-               int otherValsCount = 0;
-               for(Integer c : valsCount.values()) {
-                       if(c == 1)
-                               singltonValsCount++;
-                       else
-                               otherValsCount++;
-               }
-               return (int) Math.round(otherValsCount + singltonValsCount * 
Math.sqrt(((double) nRows) / sampleSize));
-       }
-
-       private static HashMap<DblArray, Integer> 
getValCounts(ReaderColumnSelection sampleRowsReader) {
-               HashMap<DblArray, Integer> valsCount = new HashMap<>();
-               DblArray val = null;
-               Integer cnt;
-               while(null != (val = sampleRowsReader.nextRow())) {
-                       cnt = valsCount.get(val);
-                       if(cnt == null)
-                               cnt = 0;
-                       cnt++;
-                       valsCount.put(new DblArray(val), cnt);
-               }
-               return valsCount;
-       }
-}
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
index d7b0109..429961b 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
@@ -23,7 +23,6 @@ import java.util.HashMap;
 
 import org.apache.commons.math3.analysis.UnivariateFunction;
 import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
 
 public class HassAndStokes {
 
@@ -40,16 +39,14 @@ public class HassAndStokes {
         * 
         * The hybrid estimator given by Eq. 33 in Section 6
         * 
-        * @param ubm        The Uncompressed Bit map
+        * @param numVals        The number of unique values in the sample
+        * @param freqCounts The inverted frequency histogram, where freqCounts[i-1] is the number of values occurring with frequency i
         * @param nRows      The number of rows originally in the input
         * @param sampleSize The number of rows used in the sample
         * @param solveCache A Hashmap containing information for 
getDuj2aEstimate
         * @return An estimation of distinct elements in the population.
         */
-       public static int haasAndStokes(ABitmap ubm, int nRows, int sampleSize, 
HashMap<Integer, Double> solveCache) {
-               // obtain value and frequency histograms
-               int numVals = ubm.getNumValues();
-               int[] freqCounts = FrequencyCount.get(ubm);
+       protected static int distinctCount(int numVals, int[] freqCounts, int 
nRows, int sampleSize, HashMap<Integer, Double> solveCache) {
 
                // all values in the sample are zeros.
                if(numVals == 0)
@@ -74,7 +71,9 @@ public class HassAndStokes {
                        d = getSh3Estimate(q, freqCounts, numVals);
 
                // round and ensure min value 1
-               return Math.max(1, (int) Math.round(d));
+               final int est = Math.max(1, (int) Math.round(d));
+               // The number of distinct values is bounded below by the number of uniques observed in the sample, and above by the number of rows.
+               return Math.min(Math.max(est, numVals), nRows);
        }
 
        /**
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
new file mode 100644
index 0000000..5c126ea
--- /dev/null
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.estim.sample;
+
+import java.util.Arrays;
+import java.util.HashMap;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.DMLCompressionException;
+
+public class SampleEstimatorFactory {
+
+       protected static final Log LOG = 
LogFactory.getLog(SampleEstimatorFactory.class.getName());
+
+       public enum EstimationType {
+               HassAndStokes, ShlosserEstimator, ShlosserJackknifeEstimator, 
SmoothedJackknifeEstimator,
+
+       }
+
+       public static int distinctCount(int[] frequencies, int nRows, int 
sampleSize, EstimationType type,
+               HashMap<Integer, Double> solveCache) {
+               final int numVals = ((float) frequencies[frequencies.length - 
1] /
+                       sampleSize < 0.4) ? frequencies.length : 
frequencies.length - 1;
+               try {
+
+                       switch(type) {
+                               case HassAndStokes:
+                                       return 
HassAndStokes.distinctCount(numVals, getInvertedFrequencyHistogram(frequencies, 
numVals),
+                                               nRows, sampleSize, solveCache);
+                               case ShlosserEstimator:
+                                       return 
ShlosserEstimator.distinctCount(numVals, 
getInvertedFrequencyHistogram(frequencies, numVals),
+                                               nRows, sampleSize);
+                               case ShlosserJackknifeEstimator:
+                                       return 
ShlosserJackknifeEstimator.distinctCount(numVals, frequencies,
+                                               
getInvertedFrequencyHistogram(frequencies, numVals), nRows, sampleSize);
+                               case SmoothedJackknifeEstimator:
+                                       return 
SmoothedJackknifeEstimator.distinctCount(numVals,
+                                               
getInvertedFrequencyHistogram(frequencies, numVals), nRows, sampleSize);
+                               default:
+                                       throw new NotImplementedException("Type 
not yet supported for counting distinct: " + type);
+                       }
+               }
+               catch(Exception e) {
+                       throw new DMLCompressionException(
+                               "Error while estimating distinct count with 
arguments:\n" + numVals + " frequencies:\n"
+                                       + Arrays.toString(frequencies) + "\n 
nrows: " + nRows + " " + sampleSize + " type: " + type,
+                               e);
+               }
+
+       }
+
+       private static int[] getInvertedFrequencyHistogram(int[] frequencies, 
int numVals) {
+               try {
+
+                       // Find max
+                       int maxCount = 0;
+                       for(int i = 0; i < numVals; i++) {
+                               final int v = frequencies[i];
+                               if(v > maxCount)
+                                       maxCount = v;
+                       }
+
+                       // create frequency histogram
+                       int[] freqCounts = new int[maxCount];
+                       for(int i = 0; i < numVals; i++) {
+                               if(frequencies[i] != 0)
+                                       freqCounts[frequencies[i] - 1]++;
+                       }
+
+                       return freqCounts;
+               }
+               catch(Exception e) {
+                       throw new DMLCompressionException(
+                               "Could not extract inverted frequencies from 
input: " + Arrays.toString(frequencies), e);
+               }
+       }
+}
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
index 5fd9e16..dd33e63 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
@@ -19,26 +19,22 @@
 
 package org.apache.sysds.runtime.compress.estim.sample;
 
-import org.apache.sysds.runtime.compress.utils.Bitmap;
-
 public class ShlosserEstimator {
 
        /**
         * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 
Sampling-Based Estimation of the Number of
         * Distinct Values of an Attribute. VLDB'95, Section 3.2.
         * 
-        * @param ubm        The Uncompressed Bitmap containing the data from 
the sample
+        * @param numVals        The number of unique values in the sample
+        * @param freqCounts The inverted frequency histogram, where freqCounts[i-1] is the number of values occurring with frequency i
         * @param nRows      The original number of rows in the entire input
         * @param sampleSize The number of rows in the sample
         * @return an estimation of number of distinct values.
         */
-       public static int get(Bitmap ubm, int nRows, int sampleSize) {
+       protected static int distinctCount(int numVals, int[] freqCounts, int 
nRows, int sampleSize) {
                double q = ((double) sampleSize) / nRows;
                double oneMinusQ = 1 - q;
 
-               int numVals = ubm.getNumValues();
-               int[] freqCounts = FrequencyCount.get(ubm);
-
                double numerSum = 0, denomSum = 0;
                int iPlusOne = 1;
                for(int i = 0; i < freqCounts.length; i++, iPlusOne++) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
index 7ccffe8..5f74e0f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
@@ -20,7 +20,6 @@
 package org.apache.sysds.runtime.compress.estim.sample;
 
 import org.apache.commons.math3.distribution.ChiSquaredDistribution;
-import org.apache.sysds.runtime.compress.utils.Bitmap;
 
 public class ShlosserJackknifeEstimator {
 
@@ -30,14 +29,15 @@ public class ShlosserJackknifeEstimator {
         * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 
1995. Sampling-Based Estimation of the Number
         * of Distinct Values of an Attribute. VLDB'95, Section 5.2, 
recommended estimator by the authors
         * 
-        * @param ubm        The Uncompressed Bitmap containing the data from 
the sample
+        * @param numVals        The number of unique values in the sample
+        * @param frequencies The Frequencies of the different unique values
+        * @param freqCounts The inverted frequency histogram, where freqCounts[i-1] is the number of values occurring with frequency i
         * @param nRows      The original number of rows in the entire input
         * @param sampleSize The number of rows in the sample
         * @return an estimation of number of distinct values.
         */
-       @SuppressWarnings("unused")
-       private static int shlosserJackknifeEstimator(Bitmap ubm, int nRows, 
int sampleSize) {
-               int numVals = ubm.getNumValues();
+       protected static int distinctCount(int numVals, int[] frequencies, 
int[] freqCounts, int nRows, int sampleSize) {
+
                CriticalValue cv = computeCriticalValue(sampleSize);
 
                // uniformity chi-square test
@@ -45,15 +45,15 @@ public class ShlosserJackknifeEstimator {
                // test-statistic
                double u = 0;
                for(int i = 0; i < numVals; i++) {
-                       u += Math.pow(ubm.getNumOffsets(i) - nBar, 2);
+                       u += Math.pow(frequencies[i] - nBar, 2);
                }
                u /= nBar;
                if(sampleSize != cv.usedSampleSize)
                        computeCriticalValue(sampleSize);
                if(u < cv.uniformityCriticalValue) // uniform
-                       return SmoothedJackknifeEstimator.get(ubm, nRows, 
sampleSize);
+                       return 
SmoothedJackknifeEstimator.distinctCount(numVals, freqCounts, nRows, 
sampleSize);
                else
-                       return ShlosserEstimator.get(ubm, nRows, sampleSize);
+                       return ShlosserEstimator.distinctCount(numVals, 
freqCounts, nRows, sampleSize);
        }
 
        private static CriticalValue computeCriticalValue(int sampleSize) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
index b5536e2..e9d5d50 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
@@ -19,22 +19,19 @@
 
 package org.apache.sysds.runtime.compress.estim.sample;
 
-import org.apache.sysds.runtime.compress.utils.Bitmap;
-
 public class SmoothedJackknifeEstimator {
 
        /**
         * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 
Sampling-Based Estimation of the Number of
         * Distinct Values of an Attribute. VLDB'95, Section 4.3.
         * 
-        * @param ubm        The Uncompressed Bitmap containing the data from 
the sample
+        * @param numVals    The number of unique values in the sample
+        * @param freqCounts The inverted frequency histogram, where freqCounts[i-1] is the number of values occurring with frequency i
         * @param nRows      The original number of rows in the entire input
         * @param sampleSize The number of rows in the sample
         * @return Estimate of the number of distinct values
         */
-       public static int get(Bitmap ubm, int nRows, int sampleSize) {
-               int numVals = ubm.getNumValues();
-               int[] freqCounts = FrequencyCount.get(ubm);
+       public static int distinctCount(int numVals, int[] freqCounts, int 
nRows, int sampleSize) {
                // all values in the sample are zeros
                if(freqCounts.length == 0)
                        return 0;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
index 69573a4..9f0a929 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
@@ -33,6 +33,7 @@ public abstract class ABitmap {
        }
 
        protected final int _numCols;
+       protected final int _numRows;
 
        /** Bitmaps (as lists of offsets) for each of the values. */
        protected IntArrayList[] _offsetsLists;
@@ -42,6 +43,7 @@ public abstract class ABitmap {
 
        public ABitmap(int numCols, IntArrayList[] offsetsLists, int rows) {
                _numCols = numCols;
+               _numRows = rows;
                int offsetsTotal = 0;
                if(offsetsLists != null) {
                        for(IntArrayList a : offsetsLists)
@@ -61,6 +63,16 @@ public abstract class ABitmap {
                return _numCols;
        }
 
+       public int getNumRows(){
+               return _numRows;
+       }
+
+       public boolean isEmpty(){
+               return _offsetsLists == null;
+       }
+
+       public abstract boolean lossy();
+
        /**
         * Obtain number of distinct value groups in the column. this number is 
also the number of bitmaps, since there is
         * one bitmap per value
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
index db0c2b6..d41fe94 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
@@ -95,4 +95,9 @@ public final class Bitmap extends ABitmap {
        public BitmapType getType() {
                return BitmapType.Full;
        }
+
+       @Override
+       public boolean lossy(){
+               return false;
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
index 56745e2..2c79211 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
@@ -109,6 +109,11 @@ public final class BitmapLossy extends ABitmap {
        }
 
        @Override
+       public boolean lossy(){
+               return true;
+       }
+
+       @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
                sb.append(super.toString());
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
index 80d98df..173834b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
@@ -109,4 +109,9 @@ public final class MultiColBitmap extends ABitmap {
        public BitmapType getType() {
                return BitmapType.Full;
        }
+       
+       @Override
+       public boolean lossy(){
+               return false;
+       }
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
similarity index 96%
rename from src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java
rename to src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
index 96973df..b6c9a6e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
@@ -17,7 +17,7 @@
  * under the License.
  */
 
-package org.apache.sysds.runtime.compress.cocode;
+package org.apache.sysds.runtime.compress.utils;
 
 public class Util {
        public static int[] join(int[] lhs, int[] rhs) {
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index 3a5ced0..20054f2 100644
--- 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -233,7 +233,7 @@ public abstract class CompressedTestBase extends TestBase {
 
                                                EstimationFactors ef = 
CompressedSizeEstimator.estimateCompressedColGroupSize(ubm, colIndexes,
                                                        mb.getNumRows(), cs);
-                                               CompressedSizeInfoColGroup cgi 
= new CompressedSizeInfoColGroup(ef, cs.validCompressions);
+                                               CompressedSizeInfoColGroup cgi 
= new CompressedSizeInfoColGroup(ef, cs.validCompressions, ubm);
                                                AColGroup cg = 
ColGroupFactory.compress(colIndexes, mb.getNumRows(), ubm, c, cs, mb,
                                                        cgi.getTupleSparsity());
                                                colGroups.add(cg);
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
 
b/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
new file mode 100644
index 0000000..a4760b4
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.estim;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorFactory;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.DataConverter;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class JoinCompressionInfoTest {
+
+       protected static final Log LOG = 
LogFactory.getLog(SampleEstimatorTest.class.getName());
+
+       private static final int seed = 1512314;
+
+       final MatrixBlock mbt;
+
+       public JoinCompressionInfoTest() {
+               // matrix block 2 columns
+               MatrixBlock tmp = DataConverter
+                       
.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(2, 500000, 
1, 10, 1.0, seed + 1)));
+               tmp = tmp.append(
+                       DataConverter
+                               
.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 500000, 
1, 2, 1.0, seed + 1))),
+                       new MatrixBlock(), false);
+               mbt = tmp;
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorFull() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(1.0, 1.0);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_90() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.9, 0.9);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_50() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.5, 0.90);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_20() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.2, 0.8);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_10() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.1, 0.75);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_5() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.05, 0.7);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_1() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.01, 0.6);
+       }
+
+       @Test
+       public void compressedSizeInfoEstimatorSample_p1() {
+               
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.001, 0.5);
+       }
+
+       private void 
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(double ratio, 
double tolerance) {
+               try {
+
+                       final CompressionSettings cs_estimate = new 
CompressionSettingsBuilder().setMinimumSampleSize(100)
+                               .setSamplingRatio(ratio).setSeed(seed).create();
+
+                       cs_estimate.transposed = true;
+
+                       final CompressedSizeEstimator es = 
CompressedSizeEstimatorFactory.getSizeEstimator(mbt, cs_estimate);
+                       CompressedSizeInfoColGroup g1 = 
es.estimateCompressedColGroupSize(new int[] {0});
+                       CompressedSizeInfoColGroup g2 = 
es.estimateCompressedColGroupSize(new int[] {1});
+                       g1 = es.estimateJoinCompressedSize(g1, g2);
+                       g2 = es.estimateCompressedColGroupSize(new int[] {2});
+
+                       CompressedSizeInfoColGroup joined_result = 
es.estimateJoinCompressedSize(g1, g2);
+                       CompressedSizeInfoColGroup estimate_full = 
es.estimateCompressedColGroupSize(new int[] {0, 1, 2});
+
+                       Assert.assertEquals(joined_result.getNumVals(), 
estimate_full.getNumVals());
+               }
+               catch(Exception e) {
+                       e.printStackTrace();
+                       throw e;
+               }
+
+       }
+}
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
 
b/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
new file mode 100644
index 0000000..6916732
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.mapping;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.IntArrayList;
+import org.junit.Test;
+
+public class StandAloneTests {
+       @Test
+       public void testJoin_01() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {2, 4, 6, 8})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {3, 2, 0, 2, 0, 3, 1, 3, 1, 3});
+               compare(c, new int[] {0, 1, 2, 1, 2, 0, 3, 0, 3, 0});
+       }
+
+       @Test
+       public void testJoin_02() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, a);
+               // compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+               compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void testJoin_03() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+               compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void testJoin_04() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+               compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void testJoin_05() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3}), gen(new int[] {4})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+               compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void testJoin_06() {
+               AMapToData a = MapToFactory.create(10, true,
+                       new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new 
int[] {4, 5})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {3, 0, 0, 0, 1, 2, 3, 3, 3, 3});
+               compare(c, new int[] {0, 1, 1, 1, 2, 3, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void testJoin_07() {
+               AMapToData a = MapToFactory.create(10, true,
+                       new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new 
int[] {4, 5}), gen(new int[] {6, 7})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, b);
+               // compare(c, new int[] {4, 0, 0, 0, 1, 2, 3, 3, 4, 4});
+               compare(c, new int[] {0, 1, 1, 1, 2, 3, 4, 4, 0, 0});
+       }
+
+       @Test(expected = RuntimeException.class)
+       public void testInvalidArgument() {
+               AMapToData a = MapToFactory.create(11, true,
+                       new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new 
int[] {4, 5}), gen(new int[] {6, 7})});
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               MapToFactory.join(a, b);
+       }
+
+       @Test
+       public void test_null_argument_01() {
+               AMapToData a = null;
+               AMapToData b = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData c = MapToFactory.join(a, b);
+               compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+               // compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+       }
+
+       @Test
+       public void test_null_argument_02() {
+               AMapToData a = MapToFactory.create(10, true, new IntArrayList[] 
{gen(new int[] {1, 2, 3, 4})});
+               AMapToData b = null;
+               AMapToData c = MapToFactory.join(a, b);
+               compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+               // compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+       }
+
+       private void compare(AMapToData res, int[] expected) {
+               StringBuilder sb = new StringBuilder();
+               sb.append("\nExpected:\n");
+               sb.append(Arrays.toString(expected));
+               sb.append("\nActual:\n");
+               sb.append(res.toString());
+               sb.append("\n");
+               for(int i = 0; i < expected.length; i++) {
+                       assertEquals(sb.toString(), expected[i], 
res.getIndex(i));
+               }
+       }
+
+       private static IntArrayList gen(int[] in) {
+               return new IntArrayList(in);
+       }
+}

Reply via email to