This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new c59ad18 [SYSTEMDS-3002] CLA CoCoding without Dictionaries
c59ad18 is described below
commit c59ad18c58c1a0212cfa859bb873958a96cbc624
Author: baunsgaard <[email protected]>
AuthorDate: Thu Jun 3 15:45:00 2021 +0200
[SYSTEMDS-3002] CLA CoCoding without Dictionaries
This commit makes the co-coding cheaper if many column combinations are
constructed.
For example with BinaryMNIST and bin packing co-coding, the execution
time is halved from 6 to 3 sec in the co-coding phase.
This gives a total compression time of 6 sec + IO of ~10 sec. Also
notable is normal MNIST of 60k values, where the compression time is now
~200ms.
This change impacts the matrix-cost-based column co-coding the most,
reducing the overall compression time in BinaryMNIST from ~120 sec
to ~8 sec as well.
Closes #1296
---
.../runtime/compress/CompressedMatrixBlock.java | 13 +-
.../compress/CompressedMatrixBlockFactory.java | 4 +-
.../runtime/compress/CompressionSettings.java | 16 ++-
.../compress/CompressionSettingsBuilder.java | 9 +-
.../runtime/compress/cocode/AColumnCoCoder.java | 9 +-
.../sysds/runtime/compress/cocode/CoCodeCost.java | 80 +++++++-----
.../compress/cocode/CoCodeCostMatrixMult.java | 21 ++--
.../runtime/compress/cocode/PlanningCoCoder.java | 8 +-
.../runtime/compress/colgroup/ColGroupFactory.java | 8 +-
.../compress/colgroup/mapping/AMapToData.java | 14 +++
.../compress/colgroup/mapping/MapToBit.java | 12 +-
.../compress/colgroup/mapping/MapToByte.java | 12 +-
.../compress/colgroup/mapping/MapToChar.java | 10 +-
.../compress/colgroup/mapping/MapToFactory.java | 72 ++++++++++-
.../compress/colgroup/mapping/MapToInt.java | 12 +-
.../compress/estim/CompressedSizeEstimator.java | 46 ++++++-
.../estim/CompressedSizeEstimatorExact.java | 24 ++--
.../estim/CompressedSizeEstimatorSample.java | 129 +++++++++++--------
.../compress/estim/CompressedSizeInfoColGroup.java | 58 +++++++--
.../runtime/compress/estim/EstimationFactors.java | 70 +++++++++--
.../compress/estim/sample/FrequencyCount.java | 47 -------
.../estim/sample/GuaranteedErrorEstimator.java | 66 ----------
.../compress/estim/sample/HassAndStokes.java | 13 +-
.../estim/sample/SampleEstimatorFactory.java | 96 +++++++++++++++
.../compress/estim/sample/ShlosserEstimator.java | 10 +-
.../estim/sample/ShlosserJackknifeEstimator.java | 16 +--
.../estim/sample/SmoothedJackknifeEstimator.java | 9 +-
.../sysds/runtime/compress/utils/ABitmap.java | 12 ++
.../sysds/runtime/compress/utils/Bitmap.java | 5 +
.../sysds/runtime/compress/utils/BitmapLossy.java | 5 +
.../runtime/compress/utils/MultiColBitmap.java | 5 +
.../runtime/compress/{cocode => utils}/Util.java | 2 +-
.../component/compress/CompressedTestBase.java | 2 +-
.../compress/estim/JoinCompressionInfoTest.java | 119 ++++++++++++++++++
.../compress/mapping/StandAloneTests.java | 137 +++++++++++++++++++++
35 files changed, 861 insertions(+), 310 deletions(-)
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 6d1d02b..5a52b87 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -78,6 +78,7 @@ import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
import org.apache.sysds.runtime.instructions.cp.ScalarObject;
import org.apache.sysds.runtime.instructions.spark.data.IndexedMatrixValue;
import org.apache.sysds.runtime.matrix.data.CTableMap;
+import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysds.runtime.matrix.data.LibMatrixDatagen;
import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -492,14 +493,14 @@ public class CompressedMatrixBlock extends MatrixBlock {
return out;
BinaryOperator bop = new
BinaryOperator(Multiply.getMultiplyFnObject());
-
- MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v,
null, k, true);
+ boolean allowOverlap =
ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.COMPRESSED_OVERLAPPING);
+ MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v,
null, k, allowOverlap);
if(ctype == ChainType.XtwXv) {
- // if(tmp instanceof CompressedMatrixBlock)
- tmp = CLALibBinaryCellOp.binaryOperations(bop,
(CompressedMatrixBlock) tmp, w, null);
- // else
- // LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
+ if(tmp instanceof CompressedMatrixBlock)
+ tmp = CLALibBinaryCellOp.binaryOperations(bop,
(CompressedMatrixBlock) tmp, w, null);
+ else
+ LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
}
if(tmp instanceof CompressedMatrixBlock)
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 914a707..513103a 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -170,7 +170,9 @@ public class CompressedMatrixBlockFactory {
}
private void coCodePhase(CompressedSizeEstimator sizeEstimator,
CompressedSizeInfo sizeInfos, int numRows) {
- coCodeColGroups =
PlanningCoCoder.findCoCodesByPartitioning(sizeEstimator, sizeInfos, numRows, k,
compSettings);
+ // for(int i = 0; i < 100000; i ++)
+ coCodeColGroups =
PlanningCoCoder.findCoCodesByPartitioning(sizeEstimator, sizeInfos, numRows, k,
compSettings);
+
_stats.estimatedSizeCoCoded = coCodeColGroups.memoryEstimate();
logPhase();
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index aeabc92..b93a8f5 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -25,6 +25,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import
org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;
/**
* Compression Settings class, used as a bundle of parameters inside the
Compression framework. See
@@ -99,10 +100,13 @@ public class CompressionSettings {
*/
public final int minimumSampleSize;
+ /** The sample type used for sampling */
+ public final EstimationType estimationType;
+
protected CompressionSettings(double samplingRatio, boolean
allowSharedDictionary, String transposeInput,
- boolean skipList, int seed, boolean lossy,
- EnumSet<CompressionType> validCompressions, boolean
sortValuesByLength, PartitionerType columnPartitioner,
- int maxColGroupCoCode, double coCodePercentage, int
minimumSampleSize) {
+ boolean skipList, int seed, boolean lossy,
EnumSet<CompressionType> validCompressions,
+ boolean sortValuesByLength, PartitionerType columnPartitioner,
int maxColGroupCoCode, double coCodePercentage,
+ int minimumSampleSize, EstimationType estimationType) {
this.samplingRatio = samplingRatio;
this.allowSharedDictionary = allowSharedDictionary;
this.transposeInput = transposeInput;
@@ -115,7 +119,9 @@ public class CompressionSettings {
this.maxColGroupCoCode = maxColGroupCoCode;
this.coCodePercentage = coCodePercentage;
this.minimumSampleSize = minimumSampleSize;
- LOG.debug(this);
+ this.estimationType = estimationType;
+ if(LOG.isDebugEnabled())
+ LOG.debug(this);
}
@Override
@@ -131,6 +137,8 @@ public class CompressionSettings {
sb.append("\n Max Static ColGroup CoCode: " +
maxColGroupCoCode);
sb.append("\n Max cocodePercentage: " + coCodePercentage);
sb.append("\n Sample Percentage: " + samplingRatio);
+ if(samplingRatio < 1.0)
+ sb.append("\n Estimation Type: " + estimationType);
// If needed for debugging add more fields to the printing.
return sb.toString();
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index 3ec42a0..e4ad2bb 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -26,6 +26,7 @@ import org.apache.sysds.conf.DMLConfig;
import org.apache.sysds.runtime.DMLCompressionException;
import
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import
org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory.EstimationType;
/**
* Builder pattern for Compression Settings. See CompressionSettings for
details on values.
@@ -43,6 +44,7 @@ public class CompressionSettingsBuilder {
private int maxColGroupCoCode = 10000;
private double coCodePercentage = 0.01;
private int minimumSampleSize = 2000;
+ private EstimationType estimationType = EstimationType.HassAndStokes;
private final static double defaultSampleRate = 0.01;
@@ -261,6 +263,11 @@ public class CompressionSettingsBuilder {
return this;
}
+ public CompressionSettingsBuilder setEstimationType(EstimationType
estimationType){
+ this.estimationType = estimationType;
+ return this;
+ }
+
/**
* Create the CompressionSettings object to use in the compression.
*
@@ -269,6 +276,6 @@ public class CompressionSettingsBuilder {
public CompressionSettings create() {
return new CompressionSettings(samplingRatio,
allowSharedDictionary, transposeInput, skipList, seed, lossy,
validCompressions, sortValuesByLength,
columnPartitioner, maxColGroupCoCode, coCodePercentage,
- minimumSampleSize);
+ minimumSampleSize, estimationType);
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
index 124e3a4..202e4bf 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/AColumnCoCoder.java
@@ -25,6 +25,7 @@ import org.apache.sysds.runtime.compress.CompressionSettings;
import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.utils.Util;
public abstract class AColumnCoCoder {
@@ -54,15 +55,17 @@ public abstract class AColumnCoCoder {
protected CompressedSizeInfoColGroup
joinWithAnalysis(CompressedSizeInfoColGroup lhs,
CompressedSizeInfoColGroup rhs) {
- int[] joined = Util.join(lhs.getColumns(), rhs.getColumns());
- return _est.estimateCompressedColGroupSize(joined,(
lhs.getNumVals() + 1) * (rhs.getNumVals() + 1));
+ return _est.estimateJoinCompressedSize(lhs, rhs);
}
protected CompressedSizeInfoColGroup
joinWithoutAnalysis(CompressedSizeInfoColGroup lhs,
CompressedSizeInfoColGroup rhs) {
int[] joined = Util.join(lhs.getColumns(), rhs.getColumns());
int numVals = lhs.getNumVals() + rhs.getNumVals();
- return new CompressedSizeInfoColGroup(joined, numVals,
_est.getNumRows());
+ if(numVals< 0 || numVals > _est.getNumRows())
+ return null;
+ else
+ return new CompressedSizeInfoColGroup(joined, numVals,
_est.getNumRows());
}
protected CompressedSizeInfoColGroup analyze(CompressedSizeInfoColGroup
g) {
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
index 3f7185a..a39a524 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
@@ -61,47 +61,67 @@ public class CoCodeCost extends AColumnCoCoder {
}
private List<CompressedSizeInfoColGroup>
join(List<CompressedSizeInfoColGroup> currentGroups) {
-
+ // return joinToSmallForAnalysis(currentGroups);
+ List<CompressedSizeInfoColGroup> filteredGroups =
joinToSmallForAnalysis(currentGroups);
+ // return currentGroups;
Comparator<CompressedSizeInfoColGroup> comp =
Comparator.comparing(CompressedSizeInfoColGroup::getNumVals);
- Queue<CompressedSizeInfoColGroup> que = new
PriorityQueue<>(currentGroups.size(), comp);
+ Queue<CompressedSizeInfoColGroup> que = new
PriorityQueue<>(filteredGroups.size(), comp);
List<CompressedSizeInfoColGroup> ret = new ArrayList<>();
- for(CompressedSizeInfoColGroup g : currentGroups)
- que.add(g);
-
- boolean finished = false;
- while(!finished) {
- if(que.peek() != null) {
- CompressedSizeInfoColGroup l = que.poll();
- if(que.peek() != null) {
- CompressedSizeInfoColGroup r =
que.poll();
- int worstCaseJoinedSize =
l.getNumVals() * r.getNumVals();
- if(worstCaseJoinedSize <
toSmallForAnalysis)
- que.add(joinWithoutAnalysis(l,
r));
- else if(worstCaseJoinedSize <
largestDistinct) {
- CompressedSizeInfoColGroup g =
joinWithAnalysis(l, r);
- if(g.getNumVals() <
largestDistinct)
-
que.add(joinWithAnalysis(l, r));
- else {
- ret.add(l);
- que.add(r);
- }
- }
- else {
- ret.add(l);
- que.add(r);
- }
+ for(CompressedSizeInfoColGroup g : filteredGroups) {
+ if(g != null)
+ que.add(g);
+ }
+
+ CompressedSizeInfoColGroup l = que.poll();
+
+ while(que.peek() != null) {
+ CompressedSizeInfoColGroup r = que.peek();
+ int worstCaseJoinedSize = l.getNumVals() *
r.getNumVals();
+ if(worstCaseJoinedSize < toSmallForAnalysis) {
+ que.poll();
+ que.add(joinWithoutAnalysis(l, r));
+ }
+ else if(worstCaseJoinedSize < largestDistinct) {
+ CompressedSizeInfoColGroup g =
joinWithAnalysis(l, r);
+ if(g != null && g.getNumVals() <
largestDistinct) {
+ que.poll();
+ que.add(g);
}
- else
+ else
ret.add(l);
}
- else
- finished = true;
+ else
+ ret.add(l);
+
+ l = que.poll();
}
+ ret.add(l);
+
for(CompressedSizeInfoColGroup g : que)
ret.add(g);
return ret;
}
+
+ private List<CompressedSizeInfoColGroup>
joinToSmallForAnalysis(List<CompressedSizeInfoColGroup> currentGroups) {
+ return currentGroups;
+ // List<CompressedSizeInfoColGroup> tmp = new ArrayList<>();
+ // int id = 0;
+ // while(id < currentGroups.size() - 1) {
+ // CompressedSizeInfoColGroup g1 = currentGroups.get(id);
+ // CompressedSizeInfoColGroup g2 = currentGroups.get(id +
1);
+ // if(g1.getNumVals() * g2.getNumVals() <
toSmallForAnalysis) {
+ // tmp.add(joinWithoutAnalysis(g1, g2));
+ // }
+ // else {
+ // tmp.add(g1);
+ // tmp.add(g2);
+ // }
+ // id += 2;
+
+ // }
+ // return tmp;
+ }
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
index 53f3243..2dd45d1 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
@@ -103,17 +103,22 @@ public class CoCodeCostMatrixMult extends AColumnCoCoder {
protected CostOfJoin(CompressedSizeInfoColGroup elm) {
this.elm = elm;
+ if(elm == null) {
+ this.cost = Double.POSITIVE_INFINITY;
+ }
+ else {
- final int nCols = elm.getColumns().length;
- final double nRows = _est.getNumRows();
- final double preAggregateCost = nRows;
+ final int nCols = elm.getColumns().length;
+ final double nRows = _est.getNumRows();
+ final double preAggregateCost = nRows;
- final int numberTuples = elm.getNumVals();
- final double tupleSparsity = elm.getTupleSparsity();
- final double postScalingCost = (nCols > 1 &&
tupleSparsity > 0.4) ? numberTuples * nCols : numberTuples *
- nCols * tupleSparsity;
+ final int numberTuples = elm.getNumVals();
+ final double tupleSparsity =
elm.getTupleSparsity();
+ final double postScalingCost = (nCols > 1 &&
tupleSparsity > 0.4) ? numberTuples *
+ nCols : numberTuples * nCols *
tupleSparsity;
- this.cost = preAggregateCost + postScalingCost;
+ this.cost = preAggregateCost + postScalingCost;
+ }
}
@Override
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
index 6074d1d..28912ba 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/cocode/PlanningCoCoder.java
@@ -32,6 +32,7 @@ import
org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.compress.utils.Util;
public class PlanningCoCoder {
@@ -132,7 +133,10 @@ public class PlanningCoCoder {
List<CompressedSizeInfoColGroup> finalGroups = new
ArrayList<>();
// For each bin of columns that is allowed to potentially
cocode.
for(CompressedSizeInfoColGroup bin : bins.getInfo()) {
- if(bin.getColumns().length == 1)
+ final int len = bin.getColumns().length;
+ if(len == 0)
+ continue;
+ else if(len == 1)
// early termination
finalGroups.add(bin);
else
@@ -248,7 +252,7 @@ public class PlanningCoCoder {
g =
CompressedSizeInfoColGroup.addConstGroup(c, left, cs.validCompressions);
else {
st3++;
- g =
est.estimateCompressedColGroupSize(c);
+ g =
est.estimateJoinCompressedSize(left, right);
}
if(leftConst || rightConst)
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 1cb92c7..acd59ad 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -200,7 +200,7 @@ public final class ColGroupFactory {
CompressedSizeEstimator estimator = new
CompressedSizeEstimatorExact(in, compSettings);
CompressedSizeInfoColGroup sizeInfo = new
CompressedSizeInfoColGroup(
- estimator.estimateCompressedColGroupSize(ubm,
colIndexes), compSettings.validCompressions);
+ estimator.estimateCompressedColGroupSize(ubm,
colIndexes), compSettings.validCompressions, ubm);
int numRows = compSettings.transposed ? in.getNumColumns() :
in.getNumRows();
return compress(colIndexes, numRows, ubm,
sizeInfo.getBestCompressionType(compSettings), compSettings, in,
@@ -285,8 +285,8 @@ public final class ColGroupFactory {
public static AColGroup compress(int[] colIndexes, int rlen, ABitmap
ubm, CompressionType compType,
CompressionSettings cs, MatrixBlock rawMatrixBlock, double
tupleSparsity) {
- if(compType == CompressionType.UNCOMPRESSED &&
PartitionerType.isCostBased(cs.columnPartitioner))
- compType = CompressionType.DDC;
+ // if(compType == CompressionType.UNCOMPRESSED &&
PartitionerType.isCostBased(cs.columnPartitioner))
+ // compType = CompressionType.DDC;
final IntArrayList[] of = ubm.getOffsetList();
@@ -411,7 +411,7 @@ public final class ColGroupFactory {
private static AColGroup compressDDC(int[] colIndexes, int rlen,
ABitmap ubm, CompressionSettings cs,
double tupleSparsity) {
- boolean zeros = ubm.getNumOffsets() < (long) rlen;
+ boolean zeros = ubm.getNumOffsets() < rlen;
ADictionary dict = DictionaryFactory.create(ubm, tupleSparsity,
zeros);
AMapToData data = MapToFactory.create(rlen, zeros,
ubm.getOffsetList());
return new ColGroupDDC(colIndexes, rlen, dict, data, null);
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
index ed1f3a0..42dd6b4 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/AMapToData.java
@@ -29,6 +29,20 @@ public abstract class AMapToData {
protected static final Log LOG =
LogFactory.getLog(AMapToData.class.getName());
+ private int nUnique;
+
+ protected AMapToData(int nUnique) {
+ this.nUnique = nUnique;
+ }
+
+ public int getUnique() {
+ return nUnique;
+ }
+
+ protected void setUnique(int nUnique){
+ this.nUnique = nUnique;
+ }
+
public abstract int getIndex(int n);
public abstract void set(int n, int v);
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
index d77643b..48abe9e 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToBit.java
@@ -32,12 +32,14 @@ public class MapToBit extends AMapToData {
private final BitSet _data;
private final int _size;
- public MapToBit(int size) {
+ public MapToBit(int unique, int size) {
+ super(unique);
_data = new BitSet(size);
_size = size;
}
- private MapToBit(BitSet d, int size) {
+ private MapToBit(int unique, BitSet d, int size) {
+ super(unique);
_data = d;
_size = size;
}
@@ -66,7 +68,7 @@ public class MapToBit extends AMapToData {
@Override
public long getExactSizeOnDisk() {
final int dSize = _data.size();
- long size = 1 + 4 + 4; // base variables
+ long size = 1 + 4 + 4 + 4; // base variables
size += (dSize / 64) * 8; // all longs except last
size += (dSize % 64 == 0 ? 0 : 8); // last long
return size;
@@ -86,6 +88,7 @@ public class MapToBit extends AMapToData {
public void write(DataOutput out) throws IOException {
long[] internals = _data.toLongArray();
out.writeByte(MAP_TYPE.BIT.ordinal());
+ out.writeInt(getUnique());
out.writeInt(_size);
out.writeInt(internals.length);
for(int i = 0; i < internals.length; i++)
@@ -93,11 +96,12 @@ public class MapToBit extends AMapToData {
}
public static MapToBit readFields(DataInput in) throws IOException {
+ int unique = in.readInt();
int size = in.readInt();
long[] internalLong = new long[in.readInt()];
for(int i = 0; i < internalLong.length; i++)
internalLong[i] = in.readLong();
- return new MapToBit(BitSet.valueOf(internalLong), size);
+ return new MapToBit(unique, BitSet.valueOf(internalLong), size);
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
index b6315ad..b0830a3 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToByte.java
@@ -31,11 +31,13 @@ public class MapToByte extends AMapToData {
private final byte[] _data;
- public MapToByte(int size) {
+ public MapToByte(int unique, int size) {
+ super(unique);
_data = new byte[size];
}
- private MapToByte(byte[] data) {
+ private MapToByte(int unique, byte[] data) {
+ super(unique);
_data = data;
}
@@ -62,7 +64,7 @@ public class MapToByte extends AMapToData {
@Override
public long getExactSizeOnDisk() {
- return 1 + 4 + _data.length;
+ return 1 + 4+ 4 + _data.length;
}
@Override
@@ -78,16 +80,18 @@ public class MapToByte extends AMapToData {
@Override
public void write(DataOutput out) throws IOException {
out.writeByte(MAP_TYPE.BYTE.ordinal());
+ out.writeInt(getUnique());
out.writeInt(_data.length);
for(int i = 0; i < _data.length; i++)
out.writeByte(_data[i]);
}
public static MapToByte readFields(DataInput in) throws IOException {
+ int unique = in.readInt();
final int length = in.readInt();
final byte[] data = new byte[length];
for(int i = 0; i < length; i++)
data[i] = in.readByte();
- return new MapToByte(data);
+ return new MapToByte(unique, data);
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
index ac6f388..45b9d10 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToChar.java
@@ -31,11 +31,13 @@ public class MapToChar extends AMapToData {
private final char[] _data;
- public MapToChar(int size) {
+ public MapToChar(int unique, int size) {
+ super(unique);
_data = new char[size];
}
- public MapToChar(char[] data) {
+ public MapToChar(int unique, char[] data) {
+ super(unique);
_data = data;
}
@@ -78,17 +80,19 @@ public class MapToChar extends AMapToData {
@Override
public void write(DataOutput out) throws IOException {
out.writeByte(MAP_TYPE.CHAR.ordinal());
+ out.writeInt(getUnique());
out.writeInt(_data.length);
for(int i = 0; i < _data.length; i++)
out.writeChar(_data[i]);
}
public static MapToChar readFields(DataInput in) throws IOException {
+ int unique = in.readInt();
final int length = in.readInt();
final char[] data = new char[length];
for(int i = 0; i < length; i++)
data[i] = in.readChar();
- return new MapToChar(data);
+ return new MapToChar(unique, data);
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
index cbf142b..8858640 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToFactory.java
@@ -22,15 +22,32 @@ package org.apache.sysds.runtime.compress.colgroup.mapping;
import java.io.DataInput;
import java.io.IOException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.DMLCompressionException;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
import org.apache.sysds.runtime.compress.utils.IntArrayList;
-public class MapToFactory {
+public final class MapToFactory {
+
+ protected static final Log LOG =
LogFactory.getLog(MapToFactory.class.getName());
public enum MAP_TYPE {
BIT, BYTE, CHAR, INT;
}
+ public static AMapToData create(ABitmap map) {
+ return create(map.getNumRows(), map);
+ }
+
+ public static AMapToData create(int size, ABitmap map) {
+ if(map == null || map.isEmpty())
+ return null;
+
+ boolean zeros = map.getNumOffsets() < size;
+ return create(size, zeros, map.getOffsetList());
+ }
+
public static AMapToData create(int size, boolean zeros, IntArrayList[]
values) {
AMapToData _data = MapToFactory.create(size, values.length +
(zeros ? 1 : 0));
if(zeros)
@@ -47,13 +64,13 @@ public class MapToFactory {
public static AMapToData create(int size, int numTuples) {
if(numTuples <= 1)
- return new MapToBit(size);
+ return new MapToBit(numTuples, size);
else if(numTuples <= 256)
- return new MapToByte(size);
+ return new MapToByte(numTuples, size);
else if(numTuples <= Character.MAX_VALUE)
- return new MapToChar(size);
+ return new MapToChar(numTuples, size);
else
- return new MapToInt(size);
+ return new MapToInt(numTuples, size);
}
public static long estimateInMemorySize(int size, int numTuples) {
@@ -82,4 +99,49 @@ public class MapToFactory {
throw new DMLCompressionException("Unknown Map
type.");
}
}
+
+ public static AMapToData join(AMapToData left, AMapToData right) {
+ if(left == null)
+ return right;
+ else if(right == null)
+ return left;
+ final int nVL = left.getUnique();
+ final int nVR = right.getUnique();
+ final int size = left.size();
+ final int maxUnique = nVL * nVR;
+ if(size != right.size())
+ throw new DMLCompressionException("Invalid input maps
to join, must contain same number of rows");
+
+ try {
+ return computeJoin(left, right, size, nVL, maxUnique);
+ }
+ catch(Exception e) {
+ throw new DMLCompressionException("Joining failed max
unique expected:" + maxUnique, e);
+ }
+ }
+
+ private static AMapToData computeJoin(AMapToData left, AMapToData
right, int size, int nVL, int maxUnique) {
+ AMapToData tmp = create(size, maxUnique);
+ return computeJoinUsingLinearizedMap(tmp, left, right, size,
nVL, maxUnique);
+ }
+
+ private static AMapToData computeJoinUsingLinearizedMap(AMapToData tmp,
AMapToData left, AMapToData right, int size,
+ int nVL, int maxUnique) {
+ int[] map = new int[maxUnique];
+ int newUID = 1;
+ for(int i = 0; i < size; i++) {
+ final int nv = left.getIndex(i) + right.getIndex(i) *
nVL;
+ final int mapV = map[nv];
+ if(mapV == 0) {
+ tmp.set(i, newUID - 1);
+ map[nv] = newUID++;
+ }
+ else
+ tmp.set(i, mapV - 1);
+ }
+
+ tmp.setUnique(newUID-1);
+ // LOG.error(Arrays.toString(map));
+ return tmp;
+ }
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
index 8318be1..a0e3dcb 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/mapping/MapToInt.java
@@ -31,11 +31,13 @@ public class MapToInt extends AMapToData {
private final int[] _data;
- public MapToInt(int size) {
+ public MapToInt(int unique, int size) {
+ super(unique);
_data = new int[size];
}
- private MapToInt(int[] data) {
+ private MapToInt(int unique,int[] data) {
+ super(unique);
_data = data;
}
@@ -62,7 +64,7 @@ public class MapToInt extends AMapToData {
@Override
public long getExactSizeOnDisk() {
- return 1 + 4 + _data.length * 4;
+ return 1 + 4 + 4 + _data.length * 4;
}
@Override
@@ -78,16 +80,18 @@ public class MapToInt extends AMapToData {
@Override
public void write(DataOutput out) throws IOException {
out.writeByte(MAP_TYPE.INT.ordinal());
+ out.writeInt(getUnique());
out.writeInt(_data.length);
for(int i = 0; i < _data.length; i++)
out.writeInt(_data[i]);
}
public static MapToInt readFields(DataInput in) throws IOException {
+ int unique = in.readInt();
final int length = in.readInt();
final int[] data = new int[length];
for(int i = 0; i < length; i++)
data[i] = in.readInt();
- return new MapToInt(data);
+ return new MapToInt(unique, data);
}
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
index 91e4a4e..24c2958 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
@@ -33,6 +33,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressionSettings;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
import org.apache.sysds.runtime.compress.utils.ABitmap;
+import org.apache.sysds.runtime.compress.utils.Util;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.util.CommonThreadPool;
@@ -178,6 +179,32 @@ public abstract class CompressedSizeEstimator {
public abstract CompressedSizeInfoColGroup
estimateCompressedColGroupSize(int[] colIndexes, int nrUniqueUpperBound);
/**
+ * Join two analyzed column groups together. without materializing the
dictionaries of either side.
+ *
+ * If either side was constructed without analysis then fall back to
default materialization of double arrays.
+ *
+ * @param g1 First group
+ * @param g2 Second group
+ * @return A joined compressed size estimation for the group.
+ */
+ public CompressedSizeInfoColGroup
estimateJoinCompressedSize(CompressedSizeInfoColGroup g1,
+ CompressedSizeInfoColGroup g2) {
+ int[] joined = Util.join(g1.getColumns(), g2.getColumns());
+ final int g1V = g1.getNumVals();
+ final int g2V = g2.getNumVals();
+ if(g1V * g2V < 0 || g1V * g2V > getNumRows())
+ return null;
+ else if(joined.length == 2 || (g1.getMap() == null && g2V != 0)
|| (g2.getMap() == null && g2V != 0))
+ return estimateCompressedColGroupSize(joined, (g1V + 1)
* (g2V + 1));
+ else
+ return estimateJoinCompressedSize(joined, g1, g2);
+
+ }
+
+ protected abstract CompressedSizeInfoColGroup
estimateJoinCompressedSize(int[] joinedcols,
+ CompressedSizeInfoColGroup g1, CompressedSizeInfoColGroup g2);
+
+ /**
* Method used to extract the CompressedSizeEstimationFactors from an
constructed UncompressedBitmap. Note this
* method works both for the sample based estimator and the exact
estimator, since the bitmap, can be extracted from
* a sample or from the entire dataset.
@@ -190,9 +217,10 @@ public abstract class CompressedSizeEstimator {
return estimateCompressedColGroupSize(ubm, colIndexes,
_numRows, _cs);
}
- public static EstimationFactors estimateCompressedColGroupSize(ABitmap
ubm, int[] colIndexes, int nrRows, CompressionSettings cs) {
+ public static EstimationFactors estimateCompressedColGroupSize(ABitmap
ubm, int[] colIndexes, int nrRows,
+ CompressionSettings cs) {
return EstimationFactors.computeSizeEstimationFactors(ubm,
cs.validCompressions.contains(CompressionType.RLE),
- nrRows, colIndexes);
+ colIndexes);
}
private CompressedSizeInfoColGroup[] CompressedSizeInfoColGroup(int
clen) {
@@ -247,4 +275,18 @@ public abstract class CompressedSizeEstimator {
}
return colIndexes;
}
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(this.getClass().getSimpleName());
+ sb.append(" transposed: ");
+ sb.append(_transposed);
+ sb.append(" cols: ");
+ sb.append(_numCols);
+ sb.append(" rows: ");
+ sb.append(_numRows);
+ return sb.toString();
+ }
+
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
index 5fdf8e2..20de976 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
@@ -20,6 +20,9 @@
package org.apache.sysds.runtime.compress.estim;
import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
import org.apache.sysds.runtime.compress.lib.BitmapEncoder;
import org.apache.sysds.runtime.compress.utils.ABitmap;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -37,20 +40,17 @@ public class CompressedSizeEstimatorExact extends
CompressedSizeEstimator {
public CompressedSizeInfoColGroup estimateCompressedColGroupSize(int[]
colIndexes, int nrUniqueUpperBound) {
// exact estimator can ignore upper bound.
ABitmap entireBitMap = BitmapEncoder.extractBitmap(colIndexes,
_data, _transposed);
- return new
CompressedSizeInfoColGroup(estimateCompressedColGroupSize(entireBitMap,
colIndexes),
- _cs.validCompressions);
+ EstimationFactors em =
estimateCompressedColGroupSize(entireBitMap, colIndexes);
+ return new CompressedSizeInfoColGroup(em,
_cs.validCompressions, entireBitMap);
}
@Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(this.getClass().getSimpleName());
- sb.append(" transposed: ");
- sb.append(_transposed);
- sb.append(" cols: ");
- sb.append(_numCols);
- sb.append(" rows: ");
- sb.append(_numRows);
- return sb.toString();
+ public CompressedSizeInfoColGroup estimateJoinCompressedSize(int[]
joined, CompressedSizeInfoColGroup g1,
+ CompressedSizeInfoColGroup g2) {
+ AMapToData map = MapToFactory.join(g1.getMap(), g2.getMap());
+ EstimationFactors em =
EstimationFactors.computeSizeEstimation(joined, map,
+ _cs.validCompressions.contains(CompressionType.RLE),
_numRows, false);
+ return new CompressedSizeInfoColGroup(em,
_cs.validCompressions, map);
}
+
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
index 3b43796..6c19dc9 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -21,13 +21,14 @@ package org.apache.sysds.runtime.compress.estim;
import java.util.HashMap;
+import org.apache.commons.lang.NotImplementedException;
import org.apache.sysds.runtime.compress.CompressionSettings;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
-import org.apache.sysds.runtime.compress.estim.sample.HassAndStokes;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.estim.sample.SampleEstimatorFactory;
import org.apache.sysds.runtime.compress.lib.BitmapEncoder;
import org.apache.sysds.runtime.compress.utils.ABitmap;
-import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
-import org.apache.sysds.runtime.compress.utils.IntArrayList;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.data.SparseRow;
@@ -91,63 +92,85 @@ public class CompressedSizeEstimatorSample extends
CompressedSizeEstimator {
@Override
public CompressedSizeInfoColGroup estimateCompressedColGroupSize(int[]
colIndexes, int nrUniqueUpperBound) {
- final int sampleSize = _sampleRows.length;
- // final int numCols = colIndexes.length;
- final double scalingFactor = ((double) _numRows / sampleSize);
+
// extract statistics from sample
final ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes,
_sample, _transposed);
- final EstimationFactors fact =
EstimationFactors.computeSizeEstimationFactors(ubm, false, _numRows,
colIndexes);
- final int numZerosInSample = ubm.getZeroCounts();
- final boolean lossy = ubm.getType() == BitmapType.Lossy;
-
- if(numZerosInSample == sampleSize || ubm.getOffsetList() ==
null) {
- // Since we sample, and this column seems to be empty,
we set the return to 1 value detected.
- // aka 1 value, and 1 offset.
- // This makes it more robust in the coCoding of Columns
- return new CompressedSizeInfoColGroup(new
EstimationFactors(colIndexes, 1, 1, _numRows - 1, 2, 1, _numRows,
- lossy, true, (double) 1 / _numRows, (double) 1
/ ubm.getNumColumns()), _cs.validCompressions);
+ final EstimationFactors sampleFacts =
EstimationFactors.computeSizeEstimationFactors(ubm, false, colIndexes);
+ final AMapToData map = MapToFactory.create(ubm);
+
+ // result facts
+ EstimationFactors em = estimateCompressionFactors(sampleFacts,
map, colIndexes, nrUniqueUpperBound);
+ return new CompressedSizeInfoColGroup(em,
_cs.validCompressions, map);
+ }
+
+ @Override
+ public CompressedSizeInfoColGroup estimateJoinCompressedSize(int[]
joined, CompressedSizeInfoColGroup g1,
+ CompressedSizeInfoColGroup g2) {
+ final int g1V = g1.getMap().getUnique();
+ final int g2V = g2.getMap().getUnique();
+ final int nrUniqueUpperBound = g1V * g2V;
+
+ final AMapToData map = MapToFactory.join(g1.getMap(),
g2.getMap());
+ EstimationFactors sampleFacts =
EstimationFactors.computeSizeEstimation(joined, map,
+ _cs.validCompressions.contains(CompressionType.RLE),
map.size(), false);
+
+ // result facts
+ EstimationFactors em = estimateCompressionFactors(sampleFacts,
map, joined, nrUniqueUpperBound);
+ return new CompressedSizeInfoColGroup(em,
_cs.validCompressions, map);
+ }
+
+ private EstimationFactors estimateCompressionFactors(EstimationFactors
sampleFacts, AMapToData map,
+ int[] colIndexes, int nrUniqueUpperBound) {
+ final int numZerosInSample = sampleFacts.numRows -
sampleFacts.numOffs;
+ final int sampleSize = _sampleRows.length;
+
+ if(numZerosInSample == sampleSize) {
+ final int nCol = sampleFacts.cols.length;
+ /**
+			 * Since we sample, and this column seems to be empty,
we set the return to 1 value detected, i.e. 1 value
+			 * and 1 offset. This makes it more robust in the
co-coding of columns.
+ */
+ final int largestInstanceCount = _numRows - 1;
+ return new EstimationFactors(colIndexes, 1, 1,
largestInstanceCount, new int[] {largestInstanceCount}, 2, 1,
+ _numRows, sampleFacts.lossy, true, (double) 1 /
_numRows, (double) 1 / nCol);
}
+ else {
- // Estimate number of distinct values (incl fixes for anomalies
w/ large sample fraction)
- int totalCardinality = getNumDistinctValues(ubm, _numRows,
sampleSize, _solveCache);
- // Number of unique is trivially bounded by the sampled number
of uniques and the number of rows.
- totalCardinality = Math.min(Math.min(Math.max(totalCardinality,
fact.numVals), _numRows), nrUniqueUpperBound);
+ final double scalingFactor = ((double) _numRows /
sampleSize);
+ // Estimate number of distinct values (incl fixes for
anomalies w/ large sample fraction)
+ final int totalCardinality = Math.max(map.getUnique(),
Math.min(_numRows,
+
getEstimatedDistinctCount(sampleFacts.frequencies, nrUniqueUpperBound)));
- // estimate number of non-zeros (conservatively round up)
- final double C = Math.max(1 - (double) fact.numSingle /
sampleSize, (double) sampleSize / _numRows);
- final int numNonZeros = Math.max((int) Math.floor(_numRows -
scalingFactor * C * numZerosInSample),
- totalCardinality);
+ // estimate number of non-zeros (conservatively round
up)
+ final double C = Math.max(1 - (double)
sampleFacts.numSingle / sampleSize, (double) sampleSize / _numRows);
+ final int numNonZeros = Math.max((int)
Math.floor(_numRows - scalingFactor * C * numZerosInSample),
+ totalCardinality);
+ final int totalNumRuns = getNumRuns(map,
sampleFacts.numVals, sampleSize, _numRows, _sampleRows);
+
+ final int largestInstanceCount = Math.min(_numRows,
(int) Math.floor(sampleFacts.largestOff * scalingFactor));
+
+ return new EstimationFactors(colIndexes,
totalCardinality, numNonZeros, largestInstanceCount,
+ sampleFacts.frequencies, totalNumRuns,
sampleFacts.numSingle, _numRows, sampleFacts.lossy,
+ sampleFacts.zeroIsMostFrequent,
sampleFacts.overAllSparsity, sampleFacts.tupleSparsity);
+ }
+ }
+
+ private int getEstimatedDistinctCount( int[] frequencies, int
upperBound) {
+ return Math.min(SampleEstimatorFactory.distinctCount(
frequencies, _numRows, _sampleRows.length,
+ _cs.estimationType, _solveCache), upperBound);
+ }
+
+ private int getNumRuns(AMapToData map, int numVals, int sampleSize, int
totalNumRows, int[] sampleRows) {
// estimate number of segments and number of runs incl
correction for
// empty segments and empty runs (via expected mean of offset
value)
// int numUnseenSeg = (int) (unseenVals * Math.ceil((double)
_numRows / BitmapEncoder.BITMAP_BLOCK_SZ / 2));
- final int totalNumRuns =
_cs.validCompressions.contains(CompressionType.RLE) &&
- ubm.getNumValues() > 0 ? getNumRuns(ubm, sampleSize,
_numRows, _sampleRows) : 0;
-
- // Largest instance count ... initiate as the number of zeros.
- int largestInstanceCount = numZerosInSample;
- for(IntArrayList a : ubm.getOffsetList())
- if(a.size() > largestInstanceCount)
- largestInstanceCount = a.size();
-
- final boolean zeroIsMostFrequent = largestInstanceCount ==
numZerosInSample;
-
- // Scale largest Instance count to correctly reflect the number
of instances.
- largestInstanceCount = Math.min((int) (scalingFactor *
largestInstanceCount), _numRows);
-
- EstimationFactors totalFacts = new
EstimationFactors(colIndexes, totalCardinality, numNonZeros,
- largestInstanceCount, totalNumRuns, fact.numSingle,
_numRows, lossy, zeroIsMostFrequent,
- fact.overAllSparsity, fact.tupleSparsity);
-
- // construct new size info summary
- return new CompressedSizeInfoColGroup(totalFacts,
_cs.validCompressions);
- }
-
- private static int getNumDistinctValues(ABitmap ubm, int numRows, int
sampleSize,
- HashMap<Integer, Double> solveCache) {
- return HassAndStokes.haasAndStokes(ubm, numRows, sampleSize,
solveCache);
+ return _cs.validCompressions.contains(CompressionType.RLE) &&
numVals > 0 ? getNumRuns(map, sampleSize,
+ _numRows, _sampleRows) : 0;
}
+ // Fix getNumRuns when adding RLE back.
+ @SuppressWarnings("unused")
private static int getNumRuns(ABitmap ubm, int sampleSize, int
totalNumRows, int[] sampleRows) {
int numVals = ubm.getNumValues();
double numRuns = 0;
@@ -306,6 +329,11 @@ public class CompressedSizeEstimatorSample extends
CompressedSizeEstimator {
return (int) Math.min(Math.round(numRuns), Integer.MAX_VALUE);
}
+ private static int getNumRuns(AMapToData map, int sampleSize, int
totalNumRows, int[] sampleRows) {
+
+ throw new NotImplementedException("Not Supported ever since the
ubm was replaced by the map");
+ }
+
/**
* Returns a sorted array of n integers, drawn uniformly from the range
[0,range).
*
@@ -320,7 +348,7 @@ public class CompressedSizeEstimatorSample extends
CompressedSizeEstimator {
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
- sb.append(this.getClass().getSimpleName());
+ sb.append(super.toString());
sb.append(" sampleSize: ");
sb.append(_sampleRows.length);
sb.append(" transposed: ");
@@ -331,5 +359,4 @@ public class CompressedSizeEstimatorSample extends
CompressedSizeEstimator {
sb.append(_numRows);
return sb.toString();
}
-
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
index 8b984ff..2fd7a40 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
@@ -28,9 +28,11 @@ import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.CompressionSettings;
-import
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
import org.apache.sysds.runtime.compress.colgroup.ColGroupSizes;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
/**
* Information collected about a specific ColGroup's compression size.
@@ -46,6 +48,11 @@ public class CompressedSizeInfoColGroup {
private final Map<CompressionType, Long> _sizes;
/**
+ * Map containing a mapping to unique values, but not necessarily the
actual values contained in this column group
+ */
+ private final AMapToData _map;
+
+ /**
* Join columns without analyzing the content. This only specify the
compression ratio if encoded in DDC since this
* is trivially calculated. The number of tuples contained can be set
to the upper theoretical bound of two groups
* by multiplying the number of distinct tuple of each colGroups with
each other.
@@ -62,6 +69,7 @@ public class CompressedSizeInfoColGroup {
_sizes = null;
_bestCompressionType = null;
_minSize =
ColGroupSizes.estimateInMemorySizeDDC(columns.length, numVals, numRows, 1.0,
false);
+ _map = null;
}
/**
@@ -69,8 +77,28 @@ public class CompressedSizeInfoColGroup {
*
* @param facts The facts extracted from a number of
columns, based on the estimateFactors
* @param validCompressionTypes The list of valid compression types,
allowed to be performed.
+ * @param map The map of the distinct values
contained in this column group.
*/
- public CompressedSizeInfoColGroup(EstimationFactors facts,
Set<CompressionType> validCompressionTypes) {
+ public CompressedSizeInfoColGroup(EstimationFactors facts,
Set<CompressionType> validCompressionTypes,
+ ABitmap map) {
+ _facts = facts;
+ _cardinalityRatio = (double) facts.numVals / facts.numRows;
+ _sizes = calculateCompressionSizes(facts,
validCompressionTypes);
+ Map.Entry<CompressionType, Long> bestEntry = null;
+ for(Map.Entry<CompressionType, Long> ent : _sizes.entrySet()) {
+ if(bestEntry == null || ent.getValue() <
bestEntry.getValue())
+ bestEntry = ent;
+ }
+
+ _bestCompressionType = bestEntry.getKey();
+ _minSize = bestEntry.getValue();
+ _map = MapToFactory.create(map);
+ if(LOG.isTraceEnabled())
+ LOG.trace(this);
+ }
+
+ protected CompressedSizeInfoColGroup(EstimationFactors facts,
Set<CompressionType> validCompressionTypes,
+ AMapToData map) {
_facts = facts;
_cardinalityRatio = (double) facts.numVals / facts.numRows;
_sizes = calculateCompressionSizes(facts,
validCompressionTypes);
@@ -82,6 +110,7 @@ public class CompressedSizeInfoColGroup {
_bestCompressionType = bestEntry.getKey();
_minSize = bestEntry.getValue();
+ _map = map;
if(LOG.isTraceEnabled())
LOG.trace(this);
}
@@ -89,7 +118,7 @@ public class CompressedSizeInfoColGroup {
/**
* This method adds a column group without having to analyze. This is
because the columns added are constant groups.
*
- * NOTE THIS IS ONLY VALID IF THE COLUMN ADDED IS EMPTY!
+ * NOTE THIS IS ONLY VALID IF THE COLUMN ADDED IS EMPTY OR CONSTANT!
*
* @param columns The columns of the colgroups together
* @param oneSide One of the sides, this may contain
something, but the other side (not part of the
@@ -100,7 +129,7 @@ public class CompressedSizeInfoColGroup {
public static CompressedSizeInfoColGroup addConstGroup(int[] columns,
CompressedSizeInfoColGroup oneSide,
Set<CompressionType> validCompressionTypes) {
EstimationFactors fact = new EstimationFactors(columns,
oneSide._facts);
- CompressedSizeInfoColGroup ret = new
CompressedSizeInfoColGroup(fact, validCompressionTypes);
+ CompressedSizeInfoColGroup ret = new
CompressedSizeInfoColGroup(fact, validCompressionTypes, oneSide._map);
return ret;
}
@@ -109,11 +138,6 @@ public class CompressedSizeInfoColGroup {
}
public CompressionType getBestCompressionType(CompressionSettings cs) {
- if(cs.columnPartitioner == PartitionerType.COST_MATRIX_MULT) {
- // if(_sizes.get(CompressionType.SDC) * 0.8 <
_sizes.get(_bestCompressionType))
- if(getMostCommonFraction() > 0.4)
- return CompressionType.SDC;
- }
return _bestCompressionType;
}
@@ -163,6 +187,14 @@ public class CompressedSizeInfoColGroup {
return _facts.tupleSparsity;
}
+ public AMapToData getMap() {
+ return _map;
+ }
+
+ public boolean containsZeros() {
+ return _facts.numOffs < _facts.numRows;
+ }
+
private static Map<CompressionType, Long>
calculateCompressionSizes(EstimationFactors fact,
Set<CompressionType> validCompressionTypes) {
Map<CompressionType, Long> res = new HashMap<>();
@@ -218,10 +250,10 @@ public class CompressedSizeInfoColGroup {
@Override
public boolean equals(Object that) {
- if(!(that instanceof CompressedSizeInfoColGroup))
- return false;
-
- return Arrays.equals(_facts.cols, ((CompressedSizeInfoColGroup)
that)._facts.cols);
+ throw new NotImplementedException();
+ // if(!(that instanceof CompressedSizeInfoColGroup))
+ // return false;
+ // return Arrays.equals(_facts.cols,
((CompressedSizeInfoColGroup) that)._facts.cols);
}
@Override
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
index b42e211..57d7182 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
@@ -21,12 +21,13 @@ package org.apache.sysds.runtime.compress.estim;
import java.util.Arrays;
+import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.DMLCompressionException;
import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
import org.apache.sysds.runtime.compress.utils.ABitmap;
-import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
/**
* Compressed Size Estimation factors. Contains meta information used to
estimate the compression sizes of given columns
@@ -43,6 +44,8 @@ public class EstimationFactors {
protected final int numOffs;
/** The number of instances in the largest offset, this is used to
determine if SDC is good. */
protected final int largestOff;
+ /** The frequencies of the different tuples in the columns */
+ protected final int[] frequencies;
/** The Number of runs, of consecutive equal numbers, used primarily in
RLE */
protected final int numRuns;
/** The Number of Values in the collection not Zero , Also refered to
as singletons */
@@ -62,7 +65,7 @@ public class EstimationFactors {
this.cols = cols;
this.numVals = numVals;
this.numRows = numRows;
-
+ this.frequencies = null;
this.numOffs = -1;
this.largestOff = -1;
this.numRuns = -1;
@@ -80,6 +83,7 @@ public class EstimationFactors {
this.numRows = old.numRows;
this.numOffs = old.numOffs;
this.largestOff = old.largestOff;
+ this.frequencies = old.frequencies;
this.numRuns = old.numRuns;
this.numSingle = old.numSingle;
this.lossy = old.lossy;
@@ -89,12 +93,14 @@ public class EstimationFactors {
this.tupleSparsity = old.tupleSparsity;
}
- protected EstimationFactors(int[] cols, int numVals, int numOffs, int
largestOff, int numRuns, int numSingle,
- int numRows, boolean lossy, boolean zeroIsMostFrequent, double
overAllSparsity, double tupleSparsity) {
+ protected EstimationFactors(int[] cols, int numVals, int numOffs, int
largestOff, int[] frequencies, int numRuns,
+ int numSingle, int numRows, boolean lossy, boolean
zeroIsMostFrequent, double overAllSparsity,
+ double tupleSparsity) {
this.cols = cols;
this.numVals = numVals;
this.numOffs = numOffs;
this.largestOff = largestOff;
+ this.frequencies = frequencies;
this.numRuns = numRuns;
this.numSingle = numSingle;
this.numRows = numRows;
@@ -117,12 +123,12 @@ public class EstimationFactors {
+ " > numRows: " + numRows);
}
- protected static EstimationFactors computeSizeEstimationFactors(ABitmap
ubm, boolean inclRLE, int numRows,
- int[] cols) {
+ protected static EstimationFactors computeSizeEstimationFactors(ABitmap
ubm, boolean inclRLE, int[] cols) {
+ final int numRows = ubm.getNumRows();
if(ubm == null || ubm.getOffsetList() == null)
- return new EstimationFactors(cols, 0, 0, numRows, 1, 0,
numRows, false, true, 0, 0);
+ return new EstimationFactors(cols, 0, 0, numRows, new
int[] {numRows}, 1, 0, numRows, false, true, 0, 0);
else {
- final int numVals = ubm.getNumValues();
+ int numVals = ubm.getNumValues();
int numRuns = 0;
int numOffs = 0;
int numSingle = 0;
@@ -153,6 +159,12 @@ public class EstimationFactors {
}
final int zerosOffs = numRows - numOffs;
+ final boolean containsZero = zerosOffs > 0;
+ int[] frequencies = new int[numVals + (containsZero ? 1
: 0)];
+ for(int i = 0; i < numVals; i++)
+ frequencies[i] = ubm.getNumOffsets(i);
+ if(containsZero)
+ frequencies[numVals] = zerosOffs;
final boolean zerosLargestOffset = zerosOffs >
largestOffs;
if(zerosLargestOffset)
largestOffs = zerosOffs;
@@ -160,11 +172,49 @@ public class EstimationFactors {
double overAllSparsity = (double) overallNonZeroCount /
(numRows * cols.length);
double tupleSparsity = (double) tupleNonZeroCount /
(numVals * cols.length);
- return new EstimationFactors(cols, numVals, numOffs,
largestOffs, numRuns, numSingle, numRows,
- ubm.getType() == BitmapType.Lossy,
zerosLargestOffset, overAllSparsity, tupleSparsity);
+ return new EstimationFactors(cols, numVals, numOffs,
largestOffs, frequencies, numRuns, numSingle, numRows,
+ ubm.lossy(), zerosLargestOffset,
overAllSparsity, tupleSparsity);
}
}
+ protected static EstimationFactors computeSizeEstimation(final int[]
cols, final AMapToData map,
+ final boolean inclRLE, final int numRows, final boolean
lastEntryAllZero) {
+ final boolean lossy = false;
+ if(map == null)
+ return new EstimationFactors(cols, 0, 0, numRows, new
int[] {numRows}, 1, 0, numRows, false, true, 0, 0);
+
+ final int nUnique = map.getUnique();
+ if(lastEntryAllZero) {
+ throw new NotImplementedException();
+ }
+ else {
+ final boolean zerosLargestOffset = false;
+ final double overAllSparsity = 1.0;
+ final double tupleSparsity = 1.0;
+ final int numOffs = map.size();
+ final int numSingle = 0; // unknown
+ if(inclRLE) {
+ throw new NotImplementedException();
+ }
+ else {
+ final int numRuns = 0;
+ int[] counts = new int[nUnique];
+ for(int i = 0; i < map.size(); i++) {
+ counts[map.getIndex(i)]++;
+ }
+ int largestOffs = 0;
+ for(int i = 0; i < nUnique; i++) {
+ if(counts[i] > largestOffs)
+ largestOffs = counts[i];
+ }
+ return new EstimationFactors(cols, nUnique,
numOffs, largestOffs, counts, numRuns, numSingle, numRows,
+ lossy, zerosLargestOffset,
overAllSparsity, tupleSparsity);
+ }
+
+ }
+
+ }
+
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
deleted file mode 100644
index 3badbe3..0000000
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.estim.sample;
-
-import org.apache.sysds.runtime.compress.utils.ABitmap;
-
-public class FrequencyCount {
-
- /**
- * Creates an inverted histogram, where freqCounts[i-1] indicates how
many values occurred with a frequency i. Note
- * that freqCounts[0] represents the special values of the number of
singletons.
- *
- * @param ubm uncompressed bitmap
- * @return frequency counts
- */
- protected static int[] get(ABitmap ubm) {
- // determine max frequency
- int numVals = ubm.getNumValues();
- int maxCount = 0;
- for(int i = 0; i < numVals; i++)
- maxCount = Math.max(maxCount, ubm.getNumOffsets(i));
-
- // create frequency histogram
- int[] freqCounts = new int[maxCount];
- for(int i = 0; i < numVals; i++)
- freqCounts[ubm.getNumOffsets(i) - 1]++;
-
- return freqCounts;
- }
-}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
deleted file mode 100644
index e1cea6e..0000000
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/GuaranteedErrorEstimator.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.estim.sample;
-
-import java.util.HashMap;
-
-import org.apache.sysds.runtime.compress.readers.ReaderColumnSelection;
-import org.apache.sysds.runtime.compress.utils.DblArray;
-
-public class GuaranteedErrorEstimator {
-
- /**
- * M. Charikar, S. Chaudhuri, R. Motwani, and V. R. Narasayya, Towards
estimation error guarantees for distinct
- * values, PODS'00.
- *
- * @param nRows number of rows
- * @param sampleSize sample size
- * @param sampleRowsReader a reader for the sampled rows
- * @return error estimator
- */
- @SuppressWarnings("unused")
- private static int guaranteedErrorEstimator(int nRows, int sampleSize,
ReaderColumnSelection sampleRowsReader) {
- HashMap<DblArray, Integer> valsCount =
getValCounts(sampleRowsReader);
- // number of values that occur only once
- int singltonValsCount = 0;
- int otherValsCount = 0;
- for(Integer c : valsCount.values()) {
- if(c == 1)
- singltonValsCount++;
- else
- otherValsCount++;
- }
- return (int) Math.round(otherValsCount + singltonValsCount *
Math.sqrt(((double) nRows) / sampleSize));
- }
-
- private static HashMap<DblArray, Integer>
getValCounts(ReaderColumnSelection sampleRowsReader) {
- HashMap<DblArray, Integer> valsCount = new HashMap<>();
- DblArray val = null;
- Integer cnt;
- while(null != (val = sampleRowsReader.nextRow())) {
- cnt = valsCount.get(val);
- if(cnt == null)
- cnt = 0;
- cnt++;
- valsCount.put(new DblArray(val), cnt);
- }
- return valsCount;
- }
-}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
index d7b0109..429961b 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
@@ -23,7 +23,6 @@ import java.util.HashMap;
import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
public class HassAndStokes {
@@ -40,16 +39,14 @@ public class HassAndStokes {
*
* The hybrid estimator given by Eq. 33 in Section 6
*
- * @param ubm The Uncompressed Bit map
+ * @param numVals The number of unique values in the sample
+	 * @param freqCounts The inverted histogram of frequency counts
extracted
* @param nRows The number of rows originally in the input
* @param sampleSize The number of rows used in the sample
* @param solveCache A Hashmap containing information for
getDuj2aEstimate
* @return An estimation of distinct elements in the population.
*/
- public static int haasAndStokes(ABitmap ubm, int nRows, int sampleSize,
HashMap<Integer, Double> solveCache) {
- // obtain value and frequency histograms
- int numVals = ubm.getNumValues();
- int[] freqCounts = FrequencyCount.get(ubm);
+ protected static int distinctCount(int numVals, int[] freqCounts, int
nRows, int sampleSize, HashMap<Integer, Double> solveCache) {
// all values in the sample are zeros.
if(numVals == 0)
@@ -74,7 +71,9 @@ public class HassAndStokes {
d = getSh3Estimate(q, freqCounts, numVals);
// round and ensure min value 1
- return Math.max(1, (int) Math.round(d));
+ final int est = Math.max(1, (int) Math.round(d));
+ // Number of unique is trivially bounded by the sampled number
of uniques and the number of rows.
+ return Math.min(Math.max(est, numVals), nRows);
}
/**
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
new file mode 100644
index 0000000..5c126ea
--- /dev/null
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SampleEstimatorFactory.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.estim.sample;
+
+import java.util.Arrays;
+import java.util.HashMap;
+
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.DMLCompressionException;
+
+public class SampleEstimatorFactory {
+
+ protected static final Log LOG =
LogFactory.getLog(SampleEstimatorFactory.class.getName());
+
+ public enum EstimationType {
+ HassAndStokes, ShlosserEstimator, ShlosserJackknifeEstimator,
SmoothedJackknifeEstimator,
+
+ }
+
+ public static int distinctCount(int[] frequencies, int nRows, int
sampleSize, EstimationType type,
+ HashMap<Integer, Double> solveCache) {
+ final int numVals = ((float) frequencies[frequencies.length -
1] /
+ sampleSize < 0.4) ? frequencies.length :
frequencies.length - 1;
+ try {
+
+ switch(type) {
+ case HassAndStokes:
+ return
HassAndStokes.distinctCount(numVals, getInvertedFrequencyHistogram(frequencies,
numVals),
+ nRows, sampleSize, solveCache);
+ case ShlosserEstimator:
+ return
ShlosserEstimator.distinctCount(numVals,
getInvertedFrequencyHistogram(frequencies, numVals),
+ nRows, sampleSize);
+ case ShlosserJackknifeEstimator:
+ return
ShlosserJackknifeEstimator.distinctCount(numVals, frequencies,
+
getInvertedFrequencyHistogram(frequencies, numVals), nRows, sampleSize);
+ case SmoothedJackknifeEstimator:
+ return
SmoothedJackknifeEstimator.distinctCount(numVals,
+
getInvertedFrequencyHistogram(frequencies, numVals), nRows, sampleSize);
+ default:
+ throw new NotImplementedException("Type
not yet supported for counting distinct: " + type);
+ }
+ }
+ catch(Exception e) {
+ throw new DMLCompressionException(
+ "Error while estimating distinct count with
arguments:\n" + numVals + " frequencies:\n"
+ + Arrays.toString(frequencies) + "\n
nrows: " + nRows + " " + sampleSize + " type: " + type,
+ e);
+ }
+
+ }
+
+ private static int[] getInvertedFrequencyHistogram(int[] frequencies,
int numVals) {
+ try {
+
+ // Find max
+ int maxCount = 0;
+ for(int i = 0; i < numVals; i++) {
+ final int v = frequencies[i];
+ if(v > maxCount)
+ maxCount = v;
+ }
+
+ // create frequency histogram
+ int[] freqCounts = new int[maxCount];
+ for(int i = 0; i < numVals; i++) {
+ if(frequencies[i] != 0)
+ freqCounts[frequencies[i] - 1]++;
+ }
+
+ return freqCounts;
+ }
+ catch(Exception e) {
+ throw new DMLCompressionException(
+ "Could not extract inverted frequencies from
input: " + Arrays.toString(frequencies), e);
+ }
+ }
+}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
index 5fd9e16..dd33e63 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserEstimator.java
@@ -19,26 +19,22 @@
package org.apache.sysds.runtime.compress.estim.sample;
-import org.apache.sysds.runtime.compress.utils.Bitmap;
-
public class ShlosserEstimator {
/**
* Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
Sampling-Based Estimation of the Number of
* Distinct Values of an Attribute. VLDB'95, Section 3.2.
*
- * @param ubm The Uncompressed Bitmap containing the data from
the sample
+ * @param numVals The number of unique values in the sample
+ * @param freqCounts The inverted histogram of frequency counts
extracted from the sample
* @param nRows The original number of rows in the entire input
* @param sampleSize The number of rows in the sample
* @return an estimation of number of distinct values.
*/
- public static int get(Bitmap ubm, int nRows, int sampleSize) {
+ protected static int distinctCount(int numVals, int[] freqCounts, int
nRows, int sampleSize) {
double q = ((double) sampleSize) / nRows;
double oneMinusQ = 1 - q;
- int numVals = ubm.getNumValues();
- int[] freqCounts = FrequencyCount.get(ubm);
-
double numerSum = 0, denomSum = 0;
int iPlusOne = 1;
for(int i = 0; i < freqCounts.length; i++, iPlusOne++) {
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
index 7ccffe8..5f74e0f 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/ShlosserJackknifeEstimator.java
@@ -20,7 +20,6 @@
package org.apache.sysds.runtime.compress.estim.sample;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
-import org.apache.sysds.runtime.compress.utils.Bitmap;
public class ShlosserJackknifeEstimator {
@@ -30,14 +29,15 @@ public class ShlosserJackknifeEstimator {
* Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
1995. Sampling-Based Estimation of the Number
* of Distinct Values of an Attribute. VLDB'95, Section 5.2,
recommended estimator by the authors
*
- * @param ubm The Uncompressed Bitmap containing the data from
the sample
+ * @param numVals The number of unique values in the sample
+ * @param frequencies The Frequencies of the different unique values
+ * @param freqCounts The inverted histogram of frequency counts
extracted from the sample
* @param nRows The original number of rows in the entire input
* @param sampleSize The number of rows in the sample
* @return an estimation of number of distinct values.
*/
- @SuppressWarnings("unused")
- private static int shlosserJackknifeEstimator(Bitmap ubm, int nRows,
int sampleSize) {
- int numVals = ubm.getNumValues();
+ protected static int distinctCount(int numVals, int[] frequencies,
int[] freqCounts, int nRows, int sampleSize) {
+
CriticalValue cv = computeCriticalValue(sampleSize);
// uniformity chi-square test
@@ -45,15 +45,15 @@ public class ShlosserJackknifeEstimator {
// test-statistic
double u = 0;
for(int i = 0; i < numVals; i++) {
- u += Math.pow(ubm.getNumOffsets(i) - nBar, 2);
+ u += Math.pow(frequencies[i] - nBar, 2);
}
u /= nBar;
if(sampleSize != cv.usedSampleSize)
computeCriticalValue(sampleSize);
if(u < cv.uniformityCriticalValue) // uniform
- return SmoothedJackknifeEstimator.get(ubm, nRows,
sampleSize);
+ return
SmoothedJackknifeEstimator.distinctCount(numVals, freqCounts, nRows,
sampleSize);
else
- return ShlosserEstimator.get(ubm, nRows, sampleSize);
+ return ShlosserEstimator.distinctCount(numVals,
freqCounts, nRows, sampleSize);
}
private static CriticalValue computeCriticalValue(int sampleSize) {
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
index b5536e2..e9d5d50 100644
---
a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
+++
b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/SmoothedJackknifeEstimator.java
@@ -19,22 +19,19 @@
package org.apache.sysds.runtime.compress.estim.sample;
-import org.apache.sysds.runtime.compress.utils.Bitmap;
-
public class SmoothedJackknifeEstimator {
/**
* Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
Sampling-Based Estimation of the Number of
* Distinct Values of an Attribute. VLDB'95, Section 4.3.
*
- * @param ubm The Uncompressed Bitmap containing the data from
the sample
+ * @param numVals The number of unique values in the sample
+ * @param freqCounts The inverted histogram of frequency counts
extracted from the sample
* @param nRows The original number of rows in the entire input
* @param sampleSize The number of rows in the sample
* @return Estimate of the number of distinct values
*/
- public static int get(Bitmap ubm, int nRows, int sampleSize) {
- int numVals = ubm.getNumValues();
- int[] freqCounts = FrequencyCount.get(ubm);
+ public static int distinctCount(int numVals, int[] freqCounts, int
nRows, int sampleSize) {
// all values in the sample are zeros
if(freqCounts.length == 0)
return 0;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
index 69573a4..9f0a929 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
@@ -33,6 +33,7 @@ public abstract class ABitmap {
}
protected final int _numCols;
+ protected final int _numRows;
/** Bitmaps (as lists of offsets) for each of the values. */
protected IntArrayList[] _offsetsLists;
@@ -42,6 +43,7 @@ public abstract class ABitmap {
public ABitmap(int numCols, IntArrayList[] offsetsLists, int rows) {
_numCols = numCols;
+ _numRows = rows;
int offsetsTotal = 0;
if(offsetsLists != null) {
for(IntArrayList a : offsetsLists)
@@ -61,6 +63,16 @@ public abstract class ABitmap {
return _numCols;
}
+ public int getNumRows(){
+ return _numRows;
+ }
+
+ public boolean isEmpty(){
+ return _offsetsLists == null;
+ }
+
+ public abstract boolean lossy();
+
/**
* Obtain number of distinct value groups in the column. this number is
also the number of bitmaps, since there is
* one bitmap per value
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
index db0c2b6..d41fe94 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
@@ -95,4 +95,9 @@ public final class Bitmap extends ABitmap {
public BitmapType getType() {
return BitmapType.Full;
}
+
+ @Override
+ public boolean lossy(){
+ return false;
+ }
}
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
index 56745e2..2c79211 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
@@ -109,6 +109,11 @@ public final class BitmapLossy extends ABitmap {
}
@Override
+ public boolean lossy(){
+ return true;
+ }
+
+ @Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(super.toString());
diff --git
a/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
b/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
index 80d98df..173834b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/MultiColBitmap.java
@@ -109,4 +109,9 @@ public final class MultiColBitmap extends ABitmap {
public BitmapType getType() {
return BitmapType.Full;
}
+
+ @Override
+ public boolean lossy(){
+ return false;
+ }
}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java
b/src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
similarity index 96%
rename from src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java
rename to src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
index 96973df..b6c9a6e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/Util.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/Util.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.apache.sysds.runtime.compress.cocode;
+package org.apache.sysds.runtime.compress.utils;
public class Util {
public static int[] join(int[] lhs, int[] rhs) {
diff --git
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index 3a5ced0..20054f2 100644
---
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -233,7 +233,7 @@ public abstract class CompressedTestBase extends TestBase {
EstimationFactors ef =
CompressedSizeEstimator.estimateCompressedColGroupSize(ubm, colIndexes,
mb.getNumRows(), cs);
- CompressedSizeInfoColGroup cgi
= new CompressedSizeInfoColGroup(ef, cs.validCompressions);
+ CompressedSizeInfoColGroup cgi
= new CompressedSizeInfoColGroup(ef, cs.validCompressions, ubm);
AColGroup cg =
ColGroupFactory.compress(colIndexes, mb.getNumRows(), ubm, c, cs, mb,
cgi.getTupleSparsity());
colGroups.add(cg);
diff --git
a/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
b/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
new file mode 100644
index 0000000..a4760b4
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/component/compress/estim/JoinCompressionInfoTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.estim;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.CompressionSettings;
+import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorFactory;
+import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.util.DataConverter;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class JoinCompressionInfoTest {
+
+ protected static final Log LOG =
LogFactory.getLog(SampleEstimatorTest.class.getName());
+
+ private static final int seed = 1512314;
+
+ final MatrixBlock mbt;
+
+ public JoinCompressionInfoTest() {
+ // matrix block 2 columns
+ MatrixBlock tmp = DataConverter
+
.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(2, 500000,
1, 10, 1.0, seed + 1)));
+ tmp = tmp.append(
+ DataConverter
+
.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 500000,
1, 2, 1.0, seed + 1))),
+ new MatrixBlock(), false);
+ mbt = tmp;
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorFull() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(1.0, 1.0);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_90() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.9, 0.9);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_50() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.5, 0.90);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_20() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.2, 0.8);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_10() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.1, 0.75);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_5() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.05, 0.7);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_1() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.01, 0.6);
+ }
+
+ @Test
+ public void compressedSizeInfoEstimatorSample_p1() {
+
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(0.001, 0.5);
+ }
+
+ private void
testSampleEstimateIsAtMaxEstimatedElementsInEachColumnsProduct(double ratio,
double tolerance) {
+ try {
+
+ final CompressionSettings cs_estimate = new
CompressionSettingsBuilder().setMinimumSampleSize(100)
+ .setSamplingRatio(ratio).setSeed(seed).create();
+
+ cs_estimate.transposed = true;
+
+ final CompressedSizeEstimator es =
CompressedSizeEstimatorFactory.getSizeEstimator(mbt, cs_estimate);
+ CompressedSizeInfoColGroup g1 =
es.estimateCompressedColGroupSize(new int[] {0});
+ CompressedSizeInfoColGroup g2 =
es.estimateCompressedColGroupSize(new int[] {1});
+ g1 = es.estimateJoinCompressedSize(g1, g2);
+ g2 = es.estimateCompressedColGroupSize(new int[] {2});
+
+ CompressedSizeInfoColGroup joined_result =
es.estimateJoinCompressedSize(g1, g2);
+ CompressedSizeInfoColGroup estimate_full =
es.estimateCompressedColGroupSize(new int[] {0, 1, 2});
+
+ Assert.assertEquals(joined_result.getNumVals(),
estimate_full.getNumVals());
+ }
+ catch(Exception e) {
+ e.printStackTrace();
+ throw e;
+ }
+
+ }
+}
diff --git
a/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
b/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
new file mode 100644
index 0000000..6916732
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/component/compress/mapping/StandAloneTests.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.compress.mapping;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+
+import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
+import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
+import org.apache.sysds.runtime.compress.utils.IntArrayList;
+import org.junit.Test;
+
+public class StandAloneTests {
+ @Test
+ public void testJoin_01() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {2, 4, 6, 8})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {3, 2, 0, 2, 0, 3, 1, 3, 1, 3});
+ compare(c, new int[] {0, 1, 2, 1, 2, 0, 3, 0, 3, 0});
+ }
+
+ @Test
+ public void testJoin_02() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, a);
+ // compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+ compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void testJoin_03() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+ compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void testJoin_04() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+ compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void testJoin_05() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3}), gen(new int[] {4})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {2, 0, 0, 0, 1, 2, 2, 2, 2, 2});
+ compare(c, new int[] {0, 1, 1, 1, 2, 0, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void testJoin_06() {
+ AMapToData a = MapToFactory.create(10, true,
+ new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new
int[] {4, 5})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {3, 0, 0, 0, 1, 2, 3, 3, 3, 3});
+ compare(c, new int[] {0, 1, 1, 1, 2, 3, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void testJoin_07() {
+ AMapToData a = MapToFactory.create(10, true,
+ new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new
int[] {4, 5}), gen(new int[] {6, 7})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, b);
+ // compare(c, new int[] {4, 0, 0, 0, 1, 2, 3, 3, 4, 4});
+ compare(c, new int[] {0, 1, 1, 1, 2, 3, 4, 4, 0, 0});
+ }
+
+ @Test(expected = RuntimeException.class)
+ public void testInvalidArgument() {
+ AMapToData a = MapToFactory.create(11, true,
+ new IntArrayList[] {gen(new int[] {1, 2, 3}), gen(new
int[] {4, 5}), gen(new int[] {6, 7})});
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ MapToFactory.join(a, b);
+ }
+
+ @Test
+ public void test_null_argument_01() {
+ AMapToData a = null;
+ AMapToData b = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData c = MapToFactory.join(a, b);
+ compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+ // compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+ }
+
+ @Test
+ public void test_null_argument_02() {
+ AMapToData a = MapToFactory.create(10, true, new IntArrayList[]
{gen(new int[] {1, 2, 3, 4})});
+ AMapToData b = null;
+ AMapToData c = MapToFactory.join(a, b);
+ compare(c, new int[] {1, 0, 0, 0, 0, 1, 1, 1, 1, 1});
+ // compare(c, new int[] {0, 1, 1, 1, 1, 0, 0, 0, 0, 0});
+ }
+
+ private void compare(AMapToData res, int[] expected) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("\nExpected:\n");
+ sb.append(Arrays.toString(expected));
+ sb.append("\nActual:\n");
+ sb.append(res.toString());
+ sb.append("\n");
+ for(int i = 0; i < expected.length; i++) {
+ assertEquals(sb.toString(), expected[i],
res.getIndex(i));
+ }
+ }
+
+ private static IntArrayList gen(int[] in) {
+ return new IntArrayList(in);
+ }
+}