This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit bc2602f1a24de39328424aaa90dce40509152031
Author: baunsgaard <[email protected]>
AuthorDate: Tue May 4 20:54:23 2021 +0200

    [SYSTEMDS-2999] CLA Decompression Unification
    
    This commit change the blocking of the decompression, to no longer align
    perfectly with 64k blocks, since if a column group contain many columns
    this is sub optimal.
    
    [SYSTEMDS-3000] CLA MM Most Common Element Addition
    
    This commit adds an exploitation of the compressed representation
    that allows add the most common element when multiplying on the
    left side with a compressed transposed matrix.
    This is a common occurrence in MMChain and TSMM and allows sparsity
    exploitation of dense compressed column groups.
---
 .../runtime/compress/CompressedMatrixBlock.java    |  73 ++-
 .../compress/CompressionSettingsBuilder.java       |   2 +-
 .../sysds/runtime/compress/cocode/CoCodeCost.java  |   8 +-
 .../sysds/runtime/compress/colgroup/AColGroup.java |   3 +-
 .../compress/colgroup/ColGroupCompressed.java      |  33 +-
 .../runtime/compress/colgroup/ColGroupConst.java   |   7 +-
 .../runtime/compress/colgroup/ColGroupDDC.java     | 138 ++---
 .../runtime/compress/colgroup/ColGroupEmpty.java   |   6 +-
 .../runtime/compress/colgroup/ColGroupFactory.java |  48 +-
 .../runtime/compress/colgroup/ColGroupOLE.java     | 563 ++++++++++-----------
 .../runtime/compress/colgroup/ColGroupOffset.java  |  14 -
 .../runtime/compress/colgroup/ColGroupRLE.java     | 443 ++++++++--------
 .../runtime/compress/colgroup/ColGroupSDC.java     |  84 ++-
 .../compress/colgroup/ColGroupSDCSingle.java       | 129 ++++-
 .../compress/colgroup/ColGroupSDCSingleZeros.java  |  26 +-
 .../compress/colgroup/ColGroupSDCZeros.java        |  11 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  98 ++--
 .../runtime/compress/colgroup/ColGroupValue.java   | 289 +++++++----
 .../compress/colgroup/dictionary/ADictionary.java  |  27 +-
 .../compress/colgroup/dictionary/Dictionary.java   |  85 +++-
 .../compress/colgroup/dictionary/QDictionary.java  |  39 +-
 .../sysds/runtime/compress/lib/CLALibCompAgg.java  |   1 -
 .../runtime/compress/lib/CLALibLeftMultBy.java     |  63 +--
 .../runtime/compress/lib/CLALibRightMultBy.java    |  82 +--
 .../sysds/runtime/compress/lib/CLALibScalar.java   |   3 +-
 .../sysds/runtime/matrix/data/MatrixValue.java     |   4 +-
 .../component/compress/CompressedTestBase.java     |  23 +-
 .../test/component/compress/TestConstants.java     |   2 +-
 28 files changed, 1355 insertions(+), 949 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 473a1ec..bb7781e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -199,7 +199,11 @@ public class CompressedMatrixBlock extends MatrixBlock {
                Timing time = new Timing(true);
 
                // preallocation sparse rows to avoid repeated reallocations
-               MatrixBlock ret = new MatrixBlock(rlen, clen, false, -1);
+               MatrixBlock ret = 
getUncompressedColGroupAndRemoveFromListOfColGroups();
+               if(ret != null && getColGroups().size() == 0)
+                       return ret;
+               else if(ret == null)
+                       ret = new MatrixBlock(rlen, clen, false, -1);
 
                ret.allocateDenseBlock();
                decompress(ret);
@@ -244,7 +248,11 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        return decompress();
 
                Timing time = new Timing(true);
-               MatrixBlock ret = new MatrixBlock(rlen, clen, false, 
-1).allocateBlock();
+               MatrixBlock ret = 
getUncompressedColGroupAndRemoveFromListOfColGroups();
+               if(ret != null && getColGroups().size() == 0)
+                       return ret;
+               else if(ret == null)
+                       ret = new MatrixBlock(rlen, clen, false, -1);
                ret.allocateDenseBlock();
                decompress(ret, k);
 
@@ -258,19 +266,13 @@ public class CompressedMatrixBlock extends MatrixBlock {
        }
 
        public MatrixBlock decompress(MatrixBlock ret, int k) {
-
-               if(nonZeros == -1)
-                       ret.setNonZeros(this.recomputeNonZeros());
-               else
-                       ret.setNonZeros(nonZeros);
                try {
                        ExecutorService pool = CommonThreadPool.get(k);
                        int rlen = getNumRows();
                        final int blkz = CompressionSettings.BITMAP_BLOCK_SZ;
-                       int blklen = (int) Math.ceil((double) rlen / k);
-                       blklen += (blklen % blkz != 0) ? blkz - blklen % blkz : 
0;
+                       int blklen = (int) Math.max(64, Math.ceil((double) 
(blkz) / getNumColumns()));
                        ArrayList<DecompressTask> tasks = new ArrayList<>();
-                       for(int i = 0; i < k & i * blklen < getNumRows(); i++)
+                       for(int i = 0; i * blklen < getNumRows(); i++)
                                tasks.add(new DecompressTask(_colGroups, ret, i 
* blklen, Math.min((i + 1) * blklen, rlen),
                                        overlappingColGroups));
                        List<Future<Long>> rtasks = pool.invokeAll(tasks);
@@ -285,6 +287,34 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        ret.recomputeNonZeros();
                        ret.examSparsity();
                }
+               else if(nonZeros == -1)
+                       ret.setNonZeros(this.recomputeNonZeros());
+               else
+                       ret.setNonZeros(nonZeros);
+               return ret;
+       }
+
+       private MatrixBlock 
getUncompressedColGroupAndRemoveFromListOfColGroups() {
+               // If we have a uncompressed column group that covers all of 
the matrix,
+               // it makes sense to use as the decompression target.
+               MatrixBlock ret = null;
+               // It is only relevant if we are in overlapping state, or we 
only have a Uncompressed ColumnGroup left.
+               if(isOverlapping() || _colGroups.size() == 1) {
+                       for(int i = 0; i < _colGroups.size(); i++) {
+                               AColGroup g = _colGroups.get(i);
+                               if(g instanceof ColGroupUncompressed) {
+                                       // Find an Uncompressed ColumnGroup
+                                       ColGroupUncompressed guc = 
(ColGroupUncompressed) g;
+                                       MatrixBlock gMB = guc.getData();
+                                       // Make sure that it is the correct 
dimensions
+                                       if(gMB.getNumColumns() == 
this.getNumColumns() && gMB.getNumRows() == this.getNumRows()) {
+                                               _colGroups.remove(i);
+                                               return gMB;
+                                       }
+                               }
+                       }
+               }
+
                return ret;
        }
 
@@ -480,21 +510,22 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
                // compute matrix mult
 
-               boolean tryOverlapOutput = v.getNumColumns() > 
_colGroups.size() && w != null && w.getNumRows() > 1;
-               MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v, 
null, k, tryOverlapOutput);
+               // boolean tryOverlapOutput = v.getNumColumns() > 
_colGroups.size();
+               MatrixBlock tmp = CLALibRightMultBy.rightMultByMatrix(this, v, 
null, k, true);
 
-               if(tmp instanceof CompressedMatrixBlock) {
-                       CompressedMatrixBlock tmpC = (CompressedMatrixBlock) 
tmp;
-                       if(ctype == ChainType.XtwXv)
-                               tmpC = (CompressedMatrixBlock) 
CLALibBinaryCellOp.binaryOperations(bop, tmpC, w, null);
-                       tmp = tmpC.decompress(k);
+               if(ctype == ChainType.XtwXv) {
+                       if(tmp instanceof CompressedMatrixBlock)
+                               tmp = CLALibBinaryCellOp.binaryOperations(bop, 
(CompressedMatrixBlock) tmp, w, null);
+                       else
+                               LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
                }
-               else if(ctype == ChainType.XtwXv)
-                       LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
 
-               CLALibLeftMultBy.leftMultByMatrixTransposed(this, tmp, out, k);
-               out = LibMatrixReorg.transposeInPlace(out, k);
+               if(tmp instanceof CompressedMatrixBlock)
+                       CLALibLeftMultBy.leftMultByMatrixTransposed(this, 
(CompressedMatrixBlock) tmp, out, k);
+               else
+                       CLALibLeftMultBy.leftMultByMatrixTransposed(this, tmp, 
out, k);
 
+               out = LibMatrixReorg.transposeInPlace(out, k);
                out.recomputeNonZeros();
                return out;
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index 388836f..a53bcfa 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -39,7 +39,7 @@ public class CompressionSettingsBuilder {
        private boolean investigateEstimate = true;
        private boolean lossy = false;
        private EnumSet<CompressionType> validCompressions;
-       private boolean sortValuesByLength = false;
+       private boolean sortValuesByLength = true;
        private PartitionerType columnPartitioner;
        private int maxStaticColGroupCoCode = 10;
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
index 2cbc42f..1cce5b1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCost.java
@@ -48,11 +48,15 @@ public class CoCodeCost extends AColumnCoCoder {
         */
        private final int largestDistinct;
 
-       private final static int toSmallForAnalysis = 64;
+       private final int toSmallForAnalysis;
+
+       private final double percentMaxCardinality = 0.08;
 
        protected CoCodeCost(CompressedSizeEstimator sizeEstimator, 
CompressionSettings cs, int numRows) {
                super(sizeEstimator, cs, numRows);
-               largestDistinct = Math.min(4096, Math.max(256, (int) (_numRows 
* 0.01)));
+               largestDistinct = Math.max(256, (int) (_numRows * 
percentMaxCardinality));
+               toSmallForAnalysis = largestDistinct / 4;
+               LOG.error("CocodeCost largest Distinct: "+ largestDistinct + " 
toSmallForAnalysis: " + toSmallForAnalysis);
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 2f3bdff..8de274f 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -60,6 +60,7 @@ public abstract class AColGroup implements Serializable {
 
                /**
                 * Get the super type of the specific ColGroup Type used.
+                * 
                 * @param c The concrete ColGroupType
                 * @return The super CompressionType.
                 */
@@ -751,8 +752,8 @@ public abstract class AColGroup implements Serializable {
        @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
+               sb.append(" ColGroupType: ");
                sb.append(this.getClass().getSimpleName());
-               sb.append("\n");
                sb.append(String.format("\n%15s%5d ", "Columns:", 
_colIndexes.length));
                sb.append(Arrays.toString(_colIndexes));
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
index a231283..c8f9a41 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupCompressed.java
@@ -89,16 +89,6 @@ public abstract class ColGroupCompressed extends AColGroup {
 
        protected abstract boolean sameIndexStructure(ColGroupCompressed that);
 
-       public void leftMultByMatrix(MatrixBlock matrix, double[] result, int 
numCols, int rl, int ru) {
-               if(matrix.isEmpty())
-                       return;
-               else if(matrix.isInSparseFormat())
-                       leftMultBySparseMatrix(matrix.getSparseBlock(), result, 
matrix.getNumRows(), numCols, rl, ru);
-               else {
-                       leftMultByMatrix(matrix.getDenseBlockValues(), result, 
matrix.getNumRows(), numCols, rl, ru);
-               }
-       }
-
        /**
         * Multiply with a matrix on the left.
         * 
@@ -125,22 +115,33 @@ public abstract class ColGroupCompressed extends 
AColGroup {
                int ru);
 
        @Override
-       public double getMin() {
+       public final void leftMultByMatrix(MatrixBlock matrix, double[] result, 
int numCols, int rl, int ru) {
+               if(matrix.isEmpty())
+                       return;
+               else if(matrix.isInSparseFormat())
+                       leftMultBySparseMatrix(matrix.getSparseBlock(), result, 
matrix.getNumRows(), numCols, rl, ru);
+               else {
+                       leftMultByMatrix(matrix.getDenseBlockValues(), result, 
matrix.getNumRows(), numCols, rl, ru);
+               }
+       }
+
+       @Override
+       public final double getMin() {
                return computeMxx(Double.POSITIVE_INFINITY, 
Builtin.getBuiltinFnObject(BuiltinCode.MIN));
        }
 
        @Override
-       public double getMax() {
+       public final double getMax() {
                return computeMxx(Double.NEGATIVE_INFINITY, 
Builtin.getBuiltinFnObject(BuiltinCode.MAX));
        }
 
        @Override
-       public void unaryAggregateOperations(AggregateUnaryOperator op, 
double[] c) {
+       public final void unaryAggregateOperations(AggregateUnaryOperator op, 
double[] c) {
                unaryAggregateOperations(op, c, 0, _numRows);
        }
 
        @Override
-       public void unaryAggregateOperations(AggregateUnaryOperator op, 
double[] c, int rl, int ru) {
+       public final void unaryAggregateOperations(AggregateUnaryOperator op, 
double[] c, int rl, int ru) {
                // sum and sumsq (reduceall/reducerow over tuples and counts)
                if(op.aggOp.increOp.fn instanceof Plus || op.aggOp.increOp.fn 
instanceof KahanPlus ||
                        op.aggOp.increOp.fn instanceof KahanPlusSq) {
@@ -174,13 +175,13 @@ public abstract class ColGroupCompressed extends 
AColGroup {
        @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
+               sb.append(" num Rows: " + getNumRows());
                sb.append(super.toString());
-               sb.append("num Rows: " + getNumRows());
                return sb.toString();
        }
 
        @Override
-       public int getNumRows() {
+       public final int getNumRows() {
                return _numRows;
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index 75ed773..eb38ee0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -300,7 +300,7 @@ public class ColGroupConst extends ColGroupValue {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new DMLCompressionException("Does not make sense to call 
this");
        }
 
@@ -315,6 +315,11 @@ public class ColGroupConst extends ColGroupValue {
        }
 
        @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified){
+               throw new DMLCompressionException("Does not make sense to call 
this");
+       }
+
+       @Override
        protected int containsAllZeroTuple() {
                return -1;
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index e9e98d7..aa8a55f 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -71,11 +71,11 @@ public class ColGroupDDC extends ColGroupValue {
        public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
-               double[] c = target.getDenseBlockValues();
+               final double[] c = target.getDenseBlockValues();
                offT = offT * tCol;
 
                for(int i = rl; i < ru; i++, offT += tCol) {
-                       int rowIndex = getIndex(i) * nCol;
+                       final int rowIndex = _data.getIndex(i) * nCol;
                        for(int j = 0; j < nCol; j++)
                                c[offT + _colIndexes[j]] += values[rowIndex + 
j];
                }
@@ -87,7 +87,7 @@ public class ColGroupDDC extends ColGroupValue {
                int ncol = getNumCols();
                double[] dictionary = getValues();
                for(int i = 0; i < _numRows; i++) {
-                       int rowIndex = getIndex(i) * ncol;
+                       int rowIndex = _data.getIndex(i) * ncol;
                        for(int colIx = 0; colIx < ncol; colIx++) {
                                int origMatrixColIx = getColIndex(colIx);
                                int col = colIndexTargets[origMatrixColIx];
@@ -105,7 +105,7 @@ public class ColGroupDDC extends ColGroupValue {
                double[] values = getValues();
                int nnz = 0;
                for(int i = 0; i < _numRows; i++) {
-                       int index = getIndex(i);
+                       int index = _data.getIndex(i);
                        if(index < getNumValues())
                                nnz += ((c[i] += values[(index) * ncol + 
colpos]) != 0) ? 1 : 0;
                        else
@@ -123,7 +123,7 @@ public class ColGroupDDC extends ColGroupValue {
                final int numValues = getNumValues();
                int nnz = 0;
                for(int i = 0, r = rl; i < ru - rl; i++, r++) {
-                       int index = getIndex(r);
+                       int index = _data.getIndex(r);
                        if(index < numValues)
                                nnz += ((c[i] += values[(index) * ncol + 
colpos]) != 0) ? 1 : 0;
                        else
@@ -138,7 +138,7 @@ public class ColGroupDDC extends ColGroupValue {
                double[] values = getValues();
                final int numValues = getNumValues();
                for(int i = 0, r = rl; i < ru - rl; i++, r++) {
-                       int index = getIndex(r);
+                       int index = _data.getIndex(r);
                        if(index < numValues)
                                c[i] += values[(index) * ncol + colpos];
                }
@@ -152,7 +152,7 @@ public class ColGroupDDC extends ColGroupValue {
                        throw new RuntimeException("Column index " + c + " not 
in DDC group.");
 
                // get value
-               int index = getIndex(r);
+               int index = _data.getIndex(r);
                if(index < getNumValues())
                        return _dict.getValue(index * _colIndexes.length + ix);
                else
@@ -167,7 +167,7 @@ public class ColGroupDDC extends ColGroupValue {
                double[] values = _dict.getValues();
                for(int i = rl; i < ru; i++) {
                        int lnnz = 0;
-                       int index = getIndex(i);
+                       int index = _data.getIndex(i);
                        if(index < numVals) {
                                for(int colIx = index; colIx < ncol + index; 
colIx++) {
                                        lnnz += (values[colIx]) != 0 ? 1 : 0;
@@ -181,7 +181,7 @@ public class ColGroupDDC extends ColGroupValue {
        protected void computeRowSums(double[] c, boolean square, int rl, int 
ru, boolean mean) {
                double[] vals = _dict.sumAllRowsToDouble(square, 
_colIndexes.length);
                for(int rix = rl; rix < ru; rix++)
-                       c[rix] += vals[getIndex(rix)];
+                       c[rix] += vals[_data.getIndex(rix)];
        }
 
        @Override
@@ -189,7 +189,7 @@ public class ColGroupDDC extends ColGroupValue {
                final int nCol = getNumCols();
                double[] preAggregatedRows = _dict.aggregateTuples(builtin, 
nCol);
                for(int i = rl; i < ru; i++)
-                       c[i] = builtin.execute(c[i], 
preAggregatedRows[getIndex(i)]);
+                       c[i] = builtin.execute(c[i], 
preAggregatedRows[_data.getIndex(i)]);
        }
 
        @Override
@@ -200,7 +200,7 @@ public class ColGroupDDC extends ColGroupValue {
        @Override
        public int[] getCounts(int rl, int ru, int[] counts) {
                for(int i = rl; i < ru; i++) {
-                       int index = getIndex(i);
+                       int index = _data.getIndex(i);
                        counts[index]++;
                }
                return counts;
@@ -211,10 +211,10 @@ public class ColGroupDDC extends ColGroupValue {
                double[] vals = allocDVector(getNumValues(), true);
                if(row > 0)
                        for(int i = 0, off = _numRows * row; i < _numRows; i++, 
off++)
-                               vals[getIndex(i)] += a[off];
+                               vals[_data.getIndex(i)] += a[off];
                else
                        for(int i = 0; i < _numRows; i++)
-                               vals[getIndex(i)] += a[i];
+                               vals[_data.getIndex(i)] += a[i];
 
                return vals;
        }
@@ -226,34 +226,12 @@ public class ColGroupDDC extends ColGroupValue {
                int[] indexes = sb.indexes(row);
                double[] sparseV = sb.values(row);
                for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++)
-                       vals[getIndex(indexes[i])] += sparseV[i];
+                       vals[_data.getIndex(indexes[i])] += sparseV[i];
                return vals;
 
        }
 
        /**
-        * Generic get index in dictionary for value at row position.
-        * 
-        * @param r row position to get dictionary index for.
-        * @return The dictionary index
-        */
-       protected int getIndex(int r) {
-               return _data.getIndex(r);
-       }
-
-       /**
-        * Generic get index in dictionary for value at row, col position. If 
used consider changing to getIndex and
-        * precalculate offset to row
-        * 
-        * @param r     The row to find
-        * @param colIx the col index to find
-        * @return the index in the dictionary containing the specified value
-        */
-       protected int getIndex(int r, int colIx) {
-               return _data.getIndex(r) * getNumCols() + colIx;
-       }
-
-       /**
         * Generic get value for byte-length-agnostic access to first column.
         * 
         * @param r      Global row index
@@ -261,7 +239,7 @@ public class ColGroupDDC extends ColGroupValue {
         * @return value
         */
        protected double getData(int r, double[] values) {
-               int index = getIndex(r);
+               int index = _data.getIndex(r);
                return (index < values.length) ? values[index] : 0.0;
        }
 
@@ -274,7 +252,7 @@ public class ColGroupDDC extends ColGroupValue {
         * @return value
         */
        protected double getData(int r, int colIx, double[] values) {
-               int index = getIndex(r, colIx);
+               int index = _data.getIndex(r) * _colIndexes.length + colIx;
                return (index < values.length) ? values[index] : 0.0;
        }
 
@@ -296,7 +274,7 @@ public class ColGroupDDC extends ColGroupValue {
                IPreAggregate ag = PreAggregateFactory.ag(retSize);
                // int[] m = _data.materializeMultiplied(nCol);
                for(int i = 0; i < this._numRows; i++)
-                       ag.increment(lhs.getIndex(i) + this.getIndex(i) * nCol);
+                       ag.increment(lhs._data.getIndex(i) + 
this._data.getIndex(i) * nCol);
 
                return ag;
        }
@@ -315,9 +293,9 @@ public class ColGroupDDC extends ColGroupValue {
 
                int col;
                for(; i < this._numRows && lIt.hasNext(); i++) {
-                       int row = this.getIndex(i);
+                       int row = this._data.getIndex(i);
                        if(lIt.value() == i)
-                               col = 
lhs.getIndex(lIt.getDataIndexAndIncrement());
+                               col = 
lhs._data.getIndex(lIt.getDataIndexAndIncrement());
 
                        else
                                col = offsetToDefault;
@@ -325,7 +303,7 @@ public class ColGroupDDC extends ColGroupValue {
                }
                col = offsetToDefault;
                for(; i < this._numRows; i++) {
-                       int row = this.getIndex(i);
+                       int row = this._data.getIndex(i);
                        ag.increment(col + row * nCol);
                }
 
@@ -344,7 +322,7 @@ public class ColGroupDDC extends ColGroupValue {
 
                int col;
                for(; i < this._numRows && lIt.hasNext(); i++) {
-                       int row = this.getIndex(i);
+                       int row = this._data.getIndex(i);
                        if(lIt.value() == i) {
                                col = 1;
                                lIt.next();
@@ -355,7 +333,7 @@ public class ColGroupDDC extends ColGroupValue {
                }
 
                for(; i < this._numRows; i++)
-                       ag.increment(this.getIndex(i) * nCol);
+                       ag.increment(this._data.getIndex(i) * nCol);
 
                return ag;
        }
@@ -369,8 +347,8 @@ public class ColGroupDDC extends ColGroupValue {
                final AIterator lIt = lhs._indexes.getIterator();
 
                while(lIt.hasNext()) {
-                       int row = this.getIndex(lIt.value());
-                       int col = lhs.getIndex(lIt.getDataIndexAndIncrement());
+                       int row = this._data.getIndex(lIt.value());
+                       int col = 
lhs._data.getIndex(lIt.getDataIndexAndIncrement());
                        ag.increment(col + row * nCol);
                }
 
@@ -387,7 +365,7 @@ public class ColGroupDDC extends ColGroupValue {
                final AIterator lIt = lhs._indexes.getIterator();
 
                while(lIt.hasNext()) {
-                       int row = this.getIndex(lIt.value());
+                       int row = this._data.getIndex(lIt.value());
                        lIt.next();
                        ag.increment(row);
                }
@@ -409,7 +387,7 @@ public class ColGroupDDC extends ColGroupValue {
                        for(int bixL = 0, offL = 0, sLenL = 0; bixL < bLenL; 
bixL += sLenL + 1, offL += blksz) {
                                sLenL = lhs._data[bOffL + bixL];
                                for(int i = 1; i <= sLenL; i++) {
-                                       int idx = this.getIndex(offL + 
lhs._data[bOffL + bixL + i]);
+                                       int idx = this._data.getIndex(offL + 
lhs._data[bOffL + bixL + i]);
                                        ag.increment(kl + idx * NVL);
                                }
                        }
@@ -433,7 +411,7 @@ public class ColGroupDDC extends ColGroupValue {
                                lenL = lhs._data[boffL + bixL + 1];
                                final int endL = startL + lenL;
                                for(int i = startL; i < endL; i++) {
-                                       int kr = getIndex(i) * NVL;
+                                       int kr = _data.getIndex(i) * NVL;
                                        ag.increment(kl + kr);
                                }
                        }
@@ -445,27 +423,36 @@ public class ColGroupDDC extends ColGroupValue {
        public Dictionary preAggregateThatDDCStructure(ColGroupDDC that, 
Dictionary ret) {
                final int nCol = that._colIndexes.length;
                for(int r = 0; r < _numRows; r++)
-                       that._dict.addToEntry(ret, that.getIndex(r), 
this.getIndex(r), nCol);
+                       that._dict.addToEntry(ret, that._data.getIndex(r), 
this._data.getIndex(r), nCol);
 
                return ret;
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                final AIterator itThat = that._indexes.getIterator();
                final int offsetToDefault = that.getNumValues() - 1;
                final int nCol = that._colIndexes.length;
+               if(preModified) {
+                       while(itThat.hasNext()) {
+                               final int to = _data.getIndex(itThat.value());
+                               final int fr = 
that._data.getIndex(itThat.getDataIndexAndIncrement());
+                               that._dict.addToEntry(ret, fr, to, nCol);
+                       }
+               }
+               else {
+                       int i = 0;
 
-               int i = 0;
+                       for(; i < _numRows && itThat.hasNext(); i++) {
+                               int fr = (itThat.value() == i) ? that._data
+                                       
.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefault;
+                               that._dict.addToEntry(ret, fr, 
this._data.getIndex(i), nCol);
+                       }
 
-               for(; i < _numRows && itThat.hasNext(); i++) {
-                       int fr = (itThat.value() == i) ? 
that.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefault;
-                       that._dict.addToEntry(ret, fr, this.getIndex(i), nCol);
+                       for(; i < _numRows; i++)
+                               that._dict.addToEntry(ret, offsetToDefault, 
this._data.getIndex(i), nCol);
                }
 
-               for(; i < _numRows; i++)
-                       that._dict.addToEntry(ret, offsetToDefault, 
this.getIndex(i), nCol);
-
                return ret;
        }
 
@@ -475,8 +462,8 @@ public class ColGroupDDC extends ColGroupValue {
                final int nCol = that._colIndexes.length;
 
                while(itThat.hasNext()) {
-                       final int to = getIndex(itThat.value());
-                       final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
+                       final int to = _data.getIndex(itThat.value());
+                       final int fr = 
that._data.getIndex(itThat.getDataIndexAndIncrement());
                        that._dict.addToEntry(ret, fr, to, nCol);
                }
 
@@ -489,7 +476,7 @@ public class ColGroupDDC extends ColGroupValue {
                final int nCol = that._colIndexes.length;
 
                while(itThat.hasNext()) {
-                       final int to = getIndex(itThat.value());
+                       final int to = _data.getIndex(itThat.value());
                        itThat.next();
                        that._dict.addToEntry(ret, 0, to, nCol);
                }
@@ -498,6 +485,35 @@ public class ColGroupDDC extends ColGroupValue {
        }
 
        @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified) {
+               final AIterator itThat = that._indexes.getIterator();
+               final int nCol = that._colIndexes.length;
+               if(preModified) {
+                       while(itThat.hasNext()) {
+                               final int to = _data.getIndex(itThat.value());
+                               itThat.next();
+                               that._dict.addToEntry(ret, 0, to, nCol);
+                       }
+               }
+               else {
+                       int i = 0;
+                       for(; i < _numRows && itThat.hasNext(); i++) {
+                               if(itThat.value() == i) {
+                                       that._dict.addToEntry(ret, 0, 
this._data.getIndex(i), nCol);
+                                       itThat.next();
+                               }
+                               else
+                                       that._dict.addToEntry(ret, 1, 
this._data.getIndex(i), nCol);
+                       }
+
+                       for(; i < _numRows; i++)
+                               that._dict.addToEntry(ret, 1, 
this._data.getIndex(i), nCol);
+               }
+
+               return ret;
+       }
+
+       @Override
        public boolean sameIndexStructure(ColGroupCompressed that) {
                return that instanceof ColGroupDDC && ((ColGroupDDC) 
that)._data == _data;
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
index 5192b26..2046d18 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
@@ -140,7 +140,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
                if(val0 == 0)
                        return this;
                return new ColGroupConst(_colIndexes, _numRows,
-                       new Dictionary(new double[0]).applyScalarOp(op, val0, 
_colIndexes.length));
+                       new Dictionary(new 
double[_colIndexes.length]).applyScalarOp(op, val0, _colIndexes.length));
        }
 
        @Override
@@ -148,7 +148,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
                if(sparseSafe)
                        return this;
                return new ColGroupConst(_colIndexes, _numRows,
-                       new Dictionary(new double[0]).applyBinaryRowOp(op.fn, 
v, sparseSafe, _colIndexes, left));
+                       new Dictionary(new 
double[_colIndexes.length]).applyBinaryRowOp(op.fn, v, sparseSafe, _colIndexes, 
left));
        }
 
        @Override
@@ -183,7 +183,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
 
        @Override
        protected double computeMxx(double c, Builtin builtin) {
-               return 0;
+               return builtin.execute(c, 0);
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 49ef877..f16204e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -289,14 +289,16 @@ public class ColGroupFactory {
                if(LOG.isTraceEnabled())
                        LOG.trace("compressing to: " + compType);
                try {
+                       if(cs.sortValuesByLength)
+                               ubm.sortValuesByFrequency();
 
                        switch(compType) {
                                case DDC:
                                        return compressDDC(colIndexes, rlen, 
ubm, cs);
                                case RLE:
-                                       return new ColGroupRLE(colIndexes, 
rlen, ubm, cs);
+                                       return compressRLE(colIndexes, rlen, 
ubm, cs);
                                case OLE:
-                                       return new ColGroupOLE(colIndexes, 
rlen, ubm, cs);
+                                       return compressOLE(colIndexes, rlen, 
ubm, cs);
                                case SDC:
                                        return compressSDC(colIndexes, rlen, 
ubm, cs);
                                case UNCOMPRESSED:
@@ -460,4 +462,46 @@ public class ColGroupFactory {
                return new ColGroupDDC(colIndexes, rlen, dict, _data, null);
 
        }
+
+       private static AColGroup compressOLE(int[] colIndexes, int rlen, 
ABitmap ubm, CompressionSettings cs) {
+
+               ADictionary dict = ADictionary.getDictionary(ubm);
+               ColGroupOLE ole = new ColGroupOLE(rlen);
+
+               final int numVals = ubm.getNumValues();
+               char[][] lbitmaps = new char[numVals][];
+               int totalLen = 0;
+               for(int i = 0; i < numVals; i++) {
+                       lbitmaps[i] = 
ColGroupOLE.genOffsetBitmap(ubm.getOffsetsList(i).extractValues(), 
ubm.getNumOffsets(i));
+                       totalLen += lbitmaps[i].length;
+               }
+
+               // compact bitmaps to linearized representation
+               ole.createCompressedBitmaps(numVals, totalLen, lbitmaps);
+               ole._dict = dict;
+               ole._zeros = ubm.getNumOffsets() < (long) rlen;
+               ole._colIndexes = colIndexes;
+               return ole;
+       }
+
+       private static AColGroup compressRLE(int[] colIndexes, int rlen, 
ABitmap ubm, CompressionSettings cs) {
+
+               ADictionary dict = ADictionary.getDictionary(ubm);
+               ColGroupRLE rle = new ColGroupRLE(rlen);
+               // compress the bitmaps
+               final int numVals = ubm.getNumValues();
+               char[][] lbitmaps = new char[numVals][];
+               int totalLen = 0;
+
+               for(int k = 0; k < numVals; k++) {
+                       lbitmaps[k] = 
ColGroupRLE.genRLEBitmap(ubm.getOffsetsList(k).extractValues(), 
ubm.getNumOffsets(k));
+                       totalLen += lbitmaps[k].length;
+               }
+               // compact bitmaps to linearized representation
+               rle.createCompressedBitmaps(numVals, totalLen, lbitmaps);
+               rle._dict = dict;
+               rle._zeros = ubm.getNumOffsets() < (long) rlen;
+               rle._colIndexes = colIndexes;
+               return rle;
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index cc3d9f1..8101f40 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -29,7 +29,6 @@ import 
org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
 import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
 import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
@@ -44,7 +43,7 @@ import 
org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 public class ColGroupOLE extends ColGroupOffset {
        private static final long serialVersionUID = -9157676271360528008L;
 
-               /**
+       /**
         * Constructor for serialization
         * 
         * @param numRows Number of rows contained
@@ -53,30 +52,6 @@ public class ColGroupOLE extends ColGroupOffset {
                super(numRows);
        }
 
-       /**
-        * Main constructor. Constructs and stores the necessary bitmaps.
-        * 
-        * @param colIndices indices (within the block) of the columns included 
in this column
-        * @param numRows    total number of rows in the parent block
-        * @param ubm        Uncompressed bitmap representation of the block
-        * @param cs         The Compression settings used for compression
-        */
-       protected ColGroupOLE(int[] colIndices, int numRows, ABitmap ubm, 
CompressionSettings cs) {
-               super(colIndices, numRows, ubm, cs);
-               // compress the bitmaps
-               final int numVals = ubm.getNumValues();
-               char[][] lbitmaps = new char[numVals][];
-               int totalLen = 0;
-               for(int i = 0; i < numVals; i++) {
-                       lbitmaps[i] = 
genOffsetBitmap(ubm.getOffsetsList(i).extractValues(), ubm.getNumOffsets(i));
-                       totalLen += lbitmaps[i].length;
-               }
-
-               // compact bitmaps to linearized representation
-               createCompressedBitmaps(numVals, totalLen, lbitmaps);
-
-       }
-
        protected ColGroupOLE(int[] colIndices, int numRows, boolean zeros, 
ADictionary dict, char[] bitmaps,
                int[] bitmapOffs, int[] counts) {
                super(colIndices, numRows, zeros, dict, counts);
@@ -425,304 +400,309 @@ public class ColGroupOLE extends ColGroupOffset {
 
        // @Override
        // public void rightMultByVector(double[] b, double[] c, int rl, int 
ru, double[] dictVals) {
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      final int numVals = getNumValues();
-
-       //      if(rl % blksz != 0)
-       //              throw new DMLCompressionException("All blocks should be 
starting at block segments for OLE");
-
-       //      if(numVals > 1 && _numRows > blksz * 2) {
-       //              // since single segment scans already exceed typical L2 
cache sizes
-       //              // and because there is some overhead associated with 
blocking, the
-       //              // best configuration aligns with L3 cache size 
(x*vcores*64K*8B < L3)
-       //              // x=4 leads to a good yet slightly conservative 
compromise for single-/
-       //              // multi-threaded and typical number of cores and L3 
cache sizes
-       //              final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ 
* 2;
-       //              int[] apos = skipScan(numVals, rl);
-       //              double[] aval = preaggValues(numVals, b, dictVals);
-
-       //              // step 2: cache conscious matrix-vector via horizontal 
scans
-       //              for(int bi = rl; bi < ru; bi += blksz2) {
-       //                      int bimax = Math.min(bi + blksz2, ru);
-
-       //                      // horizontal segment scan, incl pos maintenance
-       //                      for(int k = 0; k < numVals; k++) {
-       //                              int boff = _ptr[k];
-       //                              int blen = len(k);
-       //                              double val = aval[k];
-       //                              int bix = apos[k];
-
-       //                              for(int ii = bi; ii < bimax && bix < 
blen; ii += blksz) {
-       //                                      // prepare length, start, and 
end pos
-       //                                      int len = _data[boff + bix];
-       //                                      int pos = boff + bix + 1;
-
-       //                                      // compute partial results
-       //                                      LinearAlgebraUtils.vectAdd(val, 
c, _data, pos, ii, len);
-       //                                      bix += len + 1;
-       //                              }
-
-       //                              apos[k] = bix;
-       //                      }
-       //              }
-       //      }
-       //      else {
-       //              // iterate over all values and their bitmaps
-       //              for(int k = 0; k < numVals; k++) {
-       //                      // prepare value-to-add for entire value bitmap
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-       //                      double val = sumValues(k, b, dictVals);
-
-       //                      // iterate over bitmap blocks and add values
-       //                      if(val != 0) {
-       //                              int bix = 0;
-       //                              int off = 0;
-       //                              int slen = -1;
-
-       //                              // scan to beginning offset if necessary
-       //                              if(rl > 0) {
-       //                                      for(; bix < blen & off < rl; 
bix += slen + 1, off += blksz) {
-       //                                              slen = _data[boff + 
bix];
-       //                                      }
-       //                              }
-
-       //                              // compute partial results
-       //                              for(; bix < blen & off < ru; bix += 
slen + 1, off += blksz) {
-       //                                      slen = _data[boff + bix];
-       //                                      for(int blckIx = 1; blckIx <= 
slen; blckIx++) {
-       //                                              c[off + _data[boff + 
bix + blckIx]] += val;
-       //                                      }
-       //                              }
-       //                      }
-       //              }
-       //      }
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // final int numVals = getNumValues();
+
+       // if(rl % blksz != 0)
+       // throw new DMLCompressionException("All blocks should be starting at 
block segments for OLE");
+
+       // if(numVals > 1 && _numRows > blksz * 2) {
+       // // since single segment scans already exceed typical L2 cache sizes
+       // // and because there is some overhead associated with blocking, the
+       // // best configuration aligns with L3 cache size (x*vcores*64K*8B < 
L3)
+       // // x=4 leads to a good yet slightly conservative compromise for 
single-/
+       // // multi-threaded and typical number of cores and L3 cache sizes
+       // final int blksz2 = CompressionSettings.BITMAP_BLOCK_SZ * 2;
+       // int[] apos = skipScan(numVals, rl);
+       // double[] aval = preaggValues(numVals, b, dictVals);
+
+       // // step 2: cache conscious matrix-vector via horizontal scans
+       // for(int bi = rl; bi < ru; bi += blksz2) {
+       // int bimax = Math.min(bi + blksz2, ru);
+
+       // // horizontal segment scan, incl pos maintenance
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // double val = aval[k];
+       // int bix = apos[k];
+
+       // for(int ii = bi; ii < bimax && bix < blen; ii += blksz) {
+       // // prepare length, start, and end pos
+       // int len = _data[boff + bix];
+       // int pos = boff + bix + 1;
+
+       // // compute partial results
+       // LinearAlgebraUtils.vectAdd(val, c, _data, pos, ii, len);
+       // bix += len + 1;
+       // }
+
+       // apos[k] = bix;
+       // }
+       // }
+       // }
+       // else {
+       // // iterate over all values and their bitmaps
+       // for(int k = 0; k < numVals; k++) {
+       // // prepare value-to-add for entire value bitmap
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // double val = sumValues(k, b, dictVals);
+
+       // // iterate over bitmap blocks and add values
+       // if(val != 0) {
+       // int bix = 0;
+       // int off = 0;
+       // int slen = -1;
+
+       // // scan to beginning offset if necessary
+       // if(rl > 0) {
+       // for(; bix < blen & off < rl; bix += slen + 1, off += blksz) {
+       // slen = _data[boff + bix];
+       // }
+       // }
+
+       // // compute partial results
+       // for(; bix < blen & off < ru; bix += slen + 1, off += blksz) {
+       // slen = _data[boff + bix];
+       // for(int blckIx = 1; blckIx <= slen; blckIx++) {
+       // c[off + _data[boff + bix + blckIx]] += val;
+       // }
+       // }
+       // }
+       // }
+       // }
        // }
 
        // @Override
-       // public void rightMultByMatrix(int[] outputColumns, double[] 
preAggregatedB, double[] c, int thatNrColumns, int rl,
-       //      int ru) {
-
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      final int numVals = getNumValues();
-
-       //      if(numVals > 1 && _numRows > blksz * 2) {
-       //              final int blksz2 = blksz * 2;
-       //              int[] apos = skipScan(numVals, rl);
-       //              int blockStart = rl - rl % blksz;
-       //              for(int bi = blockStart; bi < ru; bi += blksz2) {
-       //                      int bimax = Math.min(bi + blksz2, ru);
-       //                      for(int k = 0; k < numVals; k++) {
-       //                              int boff = _ptr[k];
-       //                              int blen = len(k);
-       //                              int bix = apos[k];
-       //                              for(int ii = bi; ii < bimax && bix < 
blen; ii += blksz) {
-       //                                      int len = _data[boff + bix];
-       //                                      int pos = _data[boff + bix + 1];
-       //                                      if(pos >= rl)
-       //                                              addV(c, preAggregatedB, 
outputColumns, (bi + pos) * thatNrColumns, k);
-       //                                      bix += len + 1;
-       //                              }
-       //                              apos[k] = bix;
-       //                      }
-       //              }
-       //      }
-       //      else {
-       //              for(int k = 0; k < numVals; k++) {
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-       //                      int bix = skipScanVal(k, rl);
-       //                      int off = rl;
-       //                      int slen = 0;
-       //                      // compute partial results
-       //                      for(; bix < blen & off < ru; bix += slen + 1, 
off += blksz) {
-       //                              slen = _data[boff + bix];
-       //                              for(int blckIx = 1; blckIx <= slen; 
blckIx++) {
-       //                                      int rowIdx = (_data[boff + bix 
+ blckIx] + off) * thatNrColumns;
-       //                                      addV(c, preAggregatedB, 
outputColumns, rowIdx, k);
-       //                              }
-       //                      }
-       //              }
-       //      }
+       // public void rightMultByMatrix(int[] outputColumns, double[] 
preAggregatedB, double[] c, int thatNrColumns, int
+       // rl,
+       // int ru) {
+
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // final int numVals = getNumValues();
+
+       // if(numVals > 1 && _numRows > blksz * 2) {
+       // final int blksz2 = blksz * 2;
+       // int[] apos = skipScan(numVals, rl);
+       // int blockStart = rl - rl % blksz;
+       // for(int bi = blockStart; bi < ru; bi += blksz2) {
+       // int bimax = Math.min(bi + blksz2, ru);
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // int bix = apos[k];
+       // for(int ii = bi; ii < bimax && bix < blen; ii += blksz) {
+       // int len = _data[boff + bix];
+       // int pos = _data[boff + bix + 1];
+       // if(pos >= rl)
+       // addV(c, preAggregatedB, outputColumns, (bi + pos) * thatNrColumns, 
k);
+       // bix += len + 1;
+       // }
+       // apos[k] = bix;
+       // }
+       // }
+       // }
+       // else {
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // int bix = skipScanVal(k, rl);
+       // int off = rl;
+       // int slen = 0;
+       // // compute partial results
+       // for(; bix < blen & off < ru; bix += slen + 1, off += blksz) {
+       // slen = _data[boff + bix];
+       // for(int blckIx = 1; blckIx <= slen; blckIx++) {
+       // int rowIdx = (_data[boff + bix + blckIx] + off) * thatNrColumns;
+       // addV(c, preAggregatedB, outputColumns, rowIdx, k);
+       // }
+       // }
+       // }
+       // }
        // }
 
        // private static void addV(double[] c, double[] preAggregatedB, int[] 
outputColumns, int rowIdx, int k) {
-       //      int n = k * outputColumns.length;
-       //      for(int i = 0; i < outputColumns.length; i++) {
-       //              c[rowIdx + outputColumns[i]] += preAggregatedB[n + i];
-       //      }
+       // int n = k * outputColumns.length;
+       // for(int i = 0; i < outputColumns.length; i++) {
+       // c[rowIdx + outputColumns[i]] += preAggregatedB[n + i];
+       // }
        // }
 
        // @Override
        // public void leftMultByRowVector(double[] a, double[] c, int numVals, 
double[] values) {
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 
-       //      if(numVals >= 1 && _numRows > blksz)
-       //              leftMultByRowVectorBlocking(a, c, numVals, values);
-       //      else
-       //              leftMultByRowVectorNonBlocking(a, c, numVals, values);
+       // if(numVals >= 1 && _numRows > blksz)
+       // leftMultByRowVectorBlocking(a, c, numVals, values);
+       // else
+       // leftMultByRowVectorNonBlocking(a, c, numVals, values);
 
        // }
 
        // private void leftMultByRowVectorBlocking(double[] a, double[] c, int 
numVals, double[] values) {
-       //      double[] cvals = preAggregate(a);
-       //      postScaling(values, cvals, c, numVals);
+       // double[] cvals = preAggregate(a);
+       // postScaling(values, cvals, c, numVals);
        // }
 
        // private void leftMultByRowVectorNonBlocking(double[] a, double[] c, 
int numVals, double[] values) {
-       //      // iterate over all values and their bitmaps
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      final int numCols = getNumCols();
-       //      for(int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) 
{
-       //              int boff = _ptr[k];
-       //              int blen = len(k);
-
-       //              // iterate over bitmap blocks and add partial results
-       //              double vsum = 0;
-       //              for(int bix = 0, off = 0; bix < blen; bix += _data[boff 
+ bix] + 1, off += blksz)
-       //                      vsum += LinearAlgebraUtils.vectSum(a, _data, 
off, boff + bix + 1, _data[boff + bix]);
-
-       //              // scale partial results by values and write results
-       //              for(int j = 0; j < numCols; j++)
-       //                      c[_colIndexes[j]] += vsum * values[valOff + j];
-       //      }
+       // // iterate over all values and their bitmaps
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // final int numCols = getNumCols();
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+
+       // // iterate over bitmap blocks and add partial results
+       // double vsum = 0;
+       // for(int bix = 0, off = 0; bix < blen; bix += _data[boff + bix] + 1, 
off += blksz)
+       // vsum += LinearAlgebraUtils.vectSum(a, _data, off, boff + bix + 1, 
_data[boff + bix]);
+
+       // // scale partial results by values and write results
+       // for(int j = 0; j < numCols; j++)
+       // c[_colIndexes[j]] += vsum * values[valOff + j];
+       // }
        // }
 
        // @Override
        // public void leftMultByMatrix(double[] a, double[] c, double[] 
values, int numRows, int numCols, int rl, int ru,
-       //      int vOff) {
-       //      final int numVals = getNumValues();
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      if(numVals >= 1 && _numRows > blksz)
-       //              leftMultByMatrixBlocking(a, c, values, numRows, 
numCols, rl, ru, vOff, numVals);
-       //      else
-       //              leftMultByMatrixNonBlocking(a, c, values, numRows, 
numCols, rl, ru, vOff, numVals);
+       // int vOff) {
+       // final int numVals = getNumValues();
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // if(numVals >= 1 && _numRows > blksz)
+       // leftMultByMatrixBlocking(a, c, values, numRows, numCols, rl, ru, 
vOff, numVals);
+       // else
+       // leftMultByMatrixNonBlocking(a, c, values, numRows, numCols, rl, ru, 
vOff, numVals);
 
        // }
 
        // private void leftMultByMatrixBlocking(double[] a, double[] c, 
double[] values, int numRows, int numCols, int rl,
-       //      int ru, int vOff, int numVals) {
-       //      for(int i = rl; i < ru; i++) {
-       //              double[] cvals = preAggregate(a, i);
-       //              postScaling(values, cvals, c, numVals, i, numCols);
-       //      }
+       // int ru, int vOff, int numVals) {
+       // for(int i = rl; i < ru; i++) {
+       // double[] cvals = preAggregate(a, i);
+       // postScaling(values, cvals, c, numVals, i, numCols);
+       // }
        // }
 
-       // private void leftMultByMatrixNonBlocking(double[] a, double[] c, 
double[] values, int numRows, int numCols, int rl,
-       //      int ru, int vOff, int numVals) {
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      for(int i = rl, offR = vOff * _numRows; i < ru; i++, offR += 
_numRows) {
-       //              for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-
-       //                      // iterate over bitmap blocks and add partial 
results
-       //                      double vsum = 0;
-       //                      for(int bix = 0, off = 0; bix < blen; bix += 
_data[boff + bix] + 1, off += blksz)
-       //                              vsum += LinearAlgebraUtils.vectSum(a, 
_data, off + offR, boff + bix + 1, _data[boff + bix]);
-
-       //                      // scale partial results by values and write 
results
-
-       //                      int offC = i * numCols;
-       //                      for(int j = 0; j < _colIndexes.length; j++) {
-       //                              int colIx = _colIndexes[j] + offC;
-       //                              c[colIx] += vsum * values[valOff + j];
-       //                      }
-       //              }
-       //      }
+       // private void leftMultByMatrixNonBlocking(double[] a, double[] c, 
double[] values, int numRows, int numCols, int
+       // rl,
+       // int ru, int vOff, int numVals) {
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // for(int i = rl, offR = vOff * _numRows; i < ru; i++, offR += 
_numRows) {
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+
+       // // iterate over bitmap blocks and add partial results
+       // double vsum = 0;
+       // for(int bix = 0, off = 0; bix < blen; bix += _data[boff + bix] + 1, 
off += blksz)
+       // vsum += LinearAlgebraUtils.vectSum(a, _data, off + offR, boff + bix 
+ 1, _data[boff + bix]);
+
+       // // scale partial results by values and write results
+
+       // int offC = i * numCols;
+       // for(int j = 0; j < _colIndexes.length; j++) {
+       // int colIx = _colIndexes[j] + offC;
+       // c[colIx] += vsum * values[valOff + j];
+       // }
+       // }
+       // }
        // }
 
        // @Override
-       // public void leftMultBySparseMatrix(SparseBlock sb, double[] c, 
double[] values, int numRows, int numCols, int row) {
-       //      // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      // final int numVals = getNumValues();
-       //      throw new NotImplementedException("Not implemented Sparse 
multiplication OLE");
-       //      // if(numVals > 1 && _numRows > blksz)
-       //      //      leftMultBySparseMatrixBlocking(sb, c, values, numRows, 
numCols, row, tmpA, numVals);
-       //      // else
-       //      //      leftMultBySparseMatrixNonBlock(sb, c, values, numRows, 
numCols, row, tmpA, numVals);
+       // public void leftMultBySparseMatrix(SparseBlock sb, double[] c, 
double[] values, int numRows, int numCols, int
+       // row) {
+       // // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // // final int numVals = getNumValues();
+       // throw new NotImplementedException("Not implemented Sparse 
multiplication OLE");
+       // // if(numVals > 1 && _numRows > blksz)
+       // // leftMultBySparseMatrixBlocking(sb, c, values, numRows, numCols, 
row, tmpA, numVals);
+       // // else
+       // // leftMultBySparseMatrixNonBlock(sb, c, values, numRows, numCols, 
row, tmpA, numVals);
+
+       // }
+
+       // private void leftMultBySparseMatrixBlocking(SparseBlock sb, double[] 
c, double[] values, int numRows, int
+       // numCols,
+       // int row, double[] tmpA, int numVals) {
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // int sparseEndIndex = sb.size(row) + sb.pos(row);
+       // int[] indexes = sb.indexes(row);
+       // double[] sparseV = sb.values(row);
+
+       // // cache blocking config (see matrix-vector mult for explanation)
+       // final int blksz2 = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
+
+       // // step 1: prepare position and value arrays
+       // int[] apos = allocIVector(numVals, true);
+       // double[] cvals = allocDVector(numVals, true);
+       // // step 2: cache conscious matrix-vector via horizontal scans
+       // int pI = sb.pos(row);
+       // for(int ai = 0; ai < _numRows; ai += blksz2) {
+       // int aimax = Math.min(ai + blksz2, _numRows);
+       // Arrays.fill(tmpA, 0);
+       // for(; pI < sparseEndIndex && indexes[pI] < aimax; pI++) {
+       // if(indexes[pI] >= ai)
+       // tmpA[indexes[pI] - ai] = sparseV[pI];
+       // }
 
+       // // horizontal segment scan, incl pos maintenance
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // int bix = apos[k];
+       // double vsum = 0;
+       // for(int ii = ai; ii < aimax && bix < blen; ii += blksz) {
+       // int len = _data[boff + bix];
+       // int pos = boff + bix + 1;
+       // int blockId = (ii / blksz) % 2;
+       // vsum += LinearAlgebraUtils.vectSum(tmpA, _data, blockId * blksz, 
pos, len);
+       // bix += len + 1;
        // }
 
-       // private void leftMultBySparseMatrixBlocking(SparseBlock sb, double[] 
c, double[] values, int numRows, int numCols,
-       //      int row, double[] tmpA, int numVals) {
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      int sparseEndIndex = sb.size(row) + sb.pos(row);
-       //      int[] indexes = sb.indexes(row);
-       //      double[] sparseV = sb.values(row);
-
-       //      // cache blocking config (see matrix-vector mult for 
explanation)
-       //      final int blksz2 = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
-
-       //      // step 1: prepare position and value arrays
-       //      int[] apos = allocIVector(numVals, true);
-       //      double[] cvals = allocDVector(numVals, true);
-       //      // step 2: cache conscious matrix-vector via horizontal scans
-       //      int pI = sb.pos(row);
-       //      for(int ai = 0; ai < _numRows; ai += blksz2) {
-       //              int aimax = Math.min(ai + blksz2, _numRows);
-       //              Arrays.fill(tmpA, 0);
-       //              for(; pI < sparseEndIndex && indexes[pI] < aimax; pI++) 
{
-       //                      if(indexes[pI] >= ai)
-       //                              tmpA[indexes[pI] - ai] = sparseV[pI];
-       //              }
-
-       //              // horizontal segment scan, incl pos maintenance
-       //              for(int k = 0; k < numVals; k++) {
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-       //                      int bix = apos[k];
-       //                      double vsum = 0;
-       //                      for(int ii = ai; ii < aimax && bix < blen; ii 
+= blksz) {
-       //                              int len = _data[boff + bix];
-       //                              int pos = boff + bix + 1;
-       //                              int blockId = (ii / blksz) % 2;
-       //                              vsum += 
LinearAlgebraUtils.vectSum(tmpA, _data, blockId * blksz, pos, len);
-       //                              bix += len + 1;
-       //                      }
-
-       //                      apos[k] = bix;
-       //                      cvals[k] += vsum;
-       //              }
-       //      }
-
-       //      int offC = row * numCols;
-       //      // step 3: scale partial results by values and write to global 
output
-       //      for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length)
-       //              for(int j = 0; j < _colIndexes.length; j++) {
-       //                      int colIx = _colIndexes[j] + offC;
-       //                      c[colIx] += cvals[k] * values[valOff + j];
-       //              }
+       // apos[k] = bix;
+       // cvals[k] += vsum;
+       // }
+       // }
 
+       // int offC = row * numCols;
+       // // step 3: scale partial results by values and write to global output
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length)
+       // for(int j = 0; j < _colIndexes.length; j++) {
+       // int colIx = _colIndexes[j] + offC;
+       // c[colIx] += cvals[k] * values[valOff + j];
        // }
 
-       // private void leftMultBySparseMatrixNonBlock(SparseBlock sb, double[] 
c, double[] values, int numRows, int numCols,
-       //      int row, double[] tmpA, int numVals) {
-       //      final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-       //      int sparseEndIndex = sb.size(row) + sb.pos(row);
-       //      int[] indexes = sb.indexes(row);
-       //      double[] sparseV = sb.values(row);
-
-       //      for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
-       //              int boff = _ptr[k];
-       //              int blen = len(k);
-       //              double vsum = 0;
-       //              int pI = sb.pos(row);
-       //              for(int bix = 0, off = 0; bix < blen; bix += _data[boff 
+ bix] + 1, off += blksz) {
-       //                      // blockId = off / blksz;
-       //                      Arrays.fill(tmpA, 0);
-       //                      for(; pI < sparseEndIndex && indexes[pI] < off 
+ blksz; pI++) {
-       //                              if(indexes[pI] >= off)
-       //                                      tmpA[indexes[pI] - off] = 
sparseV[pI];
-       //                      }
-       //                      vsum += LinearAlgebraUtils.vectSum(tmpA, _data, 
0, boff + bix + 1, _data[boff + bix]);
-       //              }
-
-       //              for(int j = 0; j < _colIndexes.length; j++) {
-       //                      int Voff = _colIndexes[j] + row * numCols;
-       //                      c[Voff] += vsum * values[valOff + j];
-       //              }
-       //      }
+       // }
+
+       // private void leftMultBySparseMatrixNonBlock(SparseBlock sb, double[] 
c, double[] values, int numRows, int
+       // numCols,
+       // int row, double[] tmpA, int numVals) {
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
+       // int sparseEndIndex = sb.size(row) + sb.pos(row);
+       // int[] indexes = sb.indexes(row);
+       // double[] sparseV = sb.values(row);
+
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // double vsum = 0;
+       // int pI = sb.pos(row);
+       // for(int bix = 0, off = 0; bix < blen; bix += _data[boff + bix] + 1, 
off += blksz) {
+       // // blockId = off / blksz;
+       // Arrays.fill(tmpA, 0);
+       // for(; pI < sparseEndIndex && indexes[pI] < off + blksz; pI++) {
+       // if(indexes[pI] >= off)
+       // tmpA[indexes[pI] - off] = sparseV[pI];
+       // }
+       // vsum += LinearAlgebraUtils.vectSum(tmpA, _data, 0, boff + bix + 1, 
_data[boff + bix]);
+       // }
+
+       // for(int j = 0; j < _colIndexes.length; j++) {
+       // int Voff = _colIndexes[j] + row * numCols;
+       // c[Voff] += vsum * values[valOff + j];
+       // }
+       // }
        // }
 
        @Override
@@ -1090,7 +1070,7 @@ public class ColGroupOLE extends ColGroupOffset {
                        for(int bixR = 0, offR = 0, sLenR = 0; bixR < bLenR; 
bixR += sLenR + 1, offR += blksz) {
                                sLenR = this._data[bOffR + bixR];
                                for(int j = 1; j <= sLenR; j++) {
-                                       int idx = lhs.getIndex(offR + 
this._data[bOffR + bixR + j]);
+                                       int idx = lhs._data.getIndex(offR + 
this._data[bOffR + bixR + j]);
                                        ag.increment(idx + krOff);
                                }
                        }
@@ -1214,7 +1194,7 @@ public class ColGroupOLE extends ColGroupOffset {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
@@ -1227,4 +1207,9 @@ public class ColGroupOLE extends ColGroupOffset {
        public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret) {
                throw new NotImplementedException();
        }
+
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified){
+               throw new NotImplementedException();
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
index 176258a..d33e816 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
@@ -24,9 +24,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
 
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 
@@ -53,18 +51,6 @@ public abstract class ColGroupOffset extends ColGroupValue {
                super(numRows);
        }
 
-       /**
-        * Main constructor. Stores the headers for the individual bitmaps.
-        * 
-        * @param colIndices indices (within the block) of the columns included 
in this column
-        * @param numRows    total number of rows in the parent block
-        * @param ubm        Uncompressed bitmap representation of the block
-        * @param cs         The Compression settings used for compression
-        */
-       protected ColGroupOffset(int[] colIndices, int numRows, ABitmap ubm, 
CompressionSettings cs) {
-               super(colIndices, numRows, ubm, cs);
-       }
-
        protected ColGroupOffset(int[] colIndices, int numRows, boolean zeros, 
ADictionary dict, int[] cachedCounts) {
                super(colIndices, numRows, dict, cachedCounts);
                _zeros = zeros;
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index d27913b..1e79abc 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -29,7 +29,6 @@ import 
org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
 import org.apache.sysds.runtime.compress.colgroup.pre.PreAggregateFactory;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
@@ -51,31 +50,6 @@ public class ColGroupRLE extends ColGroupOffset {
                super(numRows);
        }
 
-       /**
-        * Main constructor. Constructs and stores the necessary bitmaps.
-        * 
-        * @param colIndices indices (within the block) of the columns included 
in this column
-        * @param numRows    total number of rows in the parent block
-        * @param ubm        Uncompressed bitmap representation of the block
-        * @param cs         The Compression settings used for compression
-        */
-       protected ColGroupRLE(int[] colIndices, int numRows, ABitmap ubm, 
CompressionSettings cs) {
-               super(colIndices, numRows, ubm, cs);
-
-               // compress the bitmaps
-               final int numVals = ubm.getNumValues();
-               char[][] lbitmaps = new char[numVals][];
-               int totalLen = 0;
-
-               for(int k = 0; k < numVals; k++) {
-                       lbitmaps[k] = 
genRLEBitmap(ubm.getOffsetsList(k).extractValues(), ubm.getNumOffsets(k));
-                       totalLen += lbitmaps[k].length;
-               }
-               // compact bitmaps to linearized representation
-               createCompressedBitmaps(numVals, totalLen, lbitmaps);
-
-       }
-
        protected ColGroupRLE(int[] colIndices, int numRows, boolean zeros, 
ADictionary dict, char[] bitmaps,
                int[] bitmapOffs, int[] cachedCounts) {
                super(colIndices, numRows, zeros, dict, cachedCounts);
@@ -393,223 +367,225 @@ public class ColGroupRLE extends ColGroupOffset {
 
        // @Override
        // public void rightMultByVector(double[] b, double[] c, int rl, int 
ru, double[] dictVals) {
-       //      final int numVals = getNumValues();
-       //      if(numVals >= 1 && _numRows > 
CompressionSettings.BITMAP_BLOCK_SZ) {
-       //              // L3 cache alignment, see comment rightMultByVector 
OLE column group
-       //              // core difference of RLE to OLE is that runs are not 
segment alignment,
-       //              // which requires care of handling runs crossing 
cache-buckets
-       //              final int blksz = CompressionSettings.BITMAP_BLOCK_SZ * 
2;
-
-       //              // step 1: prepare position and value arrays
-
-       //              // current pos / values per RLE list
-
-       //              // step 2: cache conscious matrix-vector via horizontal 
scans
-       //              for(int bi = rl; bi < ru; bi += blksz) {
-       //                      int[] astart = new int[numVals];
-       //                      int[] apos = skipScan(numVals, rl, astart);
-       //                      double[] aval = preaggValues(numVals, b, 
dictVals);
-       //                      int bimax = Math.min(bi + blksz, ru);
-
-       //                      // horizontal segment scan, incl pos maintenance
-       //                      for(int k = 0; k < numVals; k++) {
-       //                              int boff = _ptr[k];
-       //                              int blen = len(k);
-       //                              double val = aval[k];
-       //                              int bix = apos[k];
-       //                              int start = astart[k];
-
-       //                              // compute partial results, not aligned
-       //                              while(bix < blen & bix < bimax) {
-       //                                      int lstart = _data[boff + bix];
-       //                                      int llen = _data[boff + bix + 
1];
-       //                                      int len = Math.min(start + 
lstart + llen, bimax) - Math.max(bi, start + lstart);
-       //                                      if(len > 0) {
-       //                                              
LinearAlgebraUtils.vectAdd(val, c, Math.max(bi, start + lstart), len);
-       //                                      }
-       //                                      start += lstart + llen;
-       //                                      bix += 2;
-       //                              }
-
-       //                              apos[k] = bix;
-       //                              astart[k] = start;
-       //                      }
-       //              }
-       //      }
-       //      else {
-       //              for(int k = 0; k < numVals; k++) {
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-       //                      double val = sumValues(k, b, dictVals);
-       //                      int bix = 0;
-       //                      int start = 0;
-
-       //                      // scan to beginning offset if necessary
-       //                      if(rl > 0) { // rl aligned with blksz
-       //                              while(bix < blen) {
-       //                                      int lstart = _data[boff + bix]; 
// start
-       //                                      int llen = _data[boff + bix + 
1]; // len
-       //                                      if(start + lstart + llen >= rl)
-       //                                              break;
-       //                                      start += lstart + llen;
-       //                                      bix += 2;
-       //                              }
-       //                      }
-
-       //                      // compute partial results, not aligned
-       //                      while(bix < blen) {
-       //                              int lstart = _data[boff + bix];
-       //                              int llen = _data[boff + bix + 1];
-       //                              LinearAlgebraUtils.vectAdd(val, c, 
Math.max(rl, start + lstart),
-       //                                      Math.min(start + lstart + llen, 
ru) - Math.max(rl, start + lstart));
-       //                              if(start + lstart + llen >= ru)
-       //                                      break;
-       //                              start += lstart + llen;
-       //                              bix += 2;
-       //                      }
-       //              }
-       //      }
+       // final int numVals = getNumValues();
+       // if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+       // // L3 cache alignment, see comment rightMultByVector OLE column group
+       // // core difference of RLE to OLE is that runs are not segment 
alignment,
+       // // which requires care of handling runs crossing cache-buckets
+       // final int blksz = CompressionSettings.BITMAP_BLOCK_SZ * 2;
+
+       // // step 1: prepare position and value arrays
+
+       // // current pos / values per RLE list
+
+       // // step 2: cache conscious matrix-vector via horizontal scans
+       // for(int bi = rl; bi < ru; bi += blksz) {
+       // int[] astart = new int[numVals];
+       // int[] apos = skipScan(numVals, rl, astart);
+       // double[] aval = preaggValues(numVals, b, dictVals);
+       // int bimax = Math.min(bi + blksz, ru);
+
+       // // horizontal segment scan, incl pos maintenance
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // double val = aval[k];
+       // int bix = apos[k];
+       // int start = astart[k];
+
+       // // compute partial results, not aligned
+       // while(bix < blen & bix < bimax) {
+       // int lstart = _data[boff + bix];
+       // int llen = _data[boff + bix + 1];
+       // int len = Math.min(start + lstart + llen, bimax) - Math.max(bi, 
start + lstart);
+       // if(len > 0) {
+       // LinearAlgebraUtils.vectAdd(val, c, Math.max(bi, start + lstart), 
len);
+       // }
+       // start += lstart + llen;
+       // bix += 2;
+       // }
+
+       // apos[k] = bix;
+       // astart[k] = start;
+       // }
+       // }
+       // }
+       // else {
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // double val = sumValues(k, b, dictVals);
+       // int bix = 0;
+       // int start = 0;
+
+       // // scan to beginning offset if necessary
+       // if(rl > 0) { // rl aligned with blksz
+       // while(bix < blen) {
+       // int lstart = _data[boff + bix]; // start
+       // int llen = _data[boff + bix + 1]; // len
+       // if(start + lstart + llen >= rl)
+       // break;
+       // start += lstart + llen;
+       // bix += 2;
+       // }
+       // }
+
+       // // compute partial results, not aligned
+       // while(bix < blen) {
+       // int lstart = _data[boff + bix];
+       // int llen = _data[boff + bix + 1];
+       // LinearAlgebraUtils.vectAdd(val, c, Math.max(rl, start + lstart),
+       // Math.min(start + lstart + llen, ru) - Math.max(rl, start + lstart));
+       // if(start + lstart + llen >= ru)
+       // break;
+       // start += lstart + llen;
+       // bix += 2;
+       // }
+       // }
+       // }
        // }
 
        // @Override
-       // public void rightMultByMatrix(int[] outputColumns, double[] 
preAggregatedB, double[] c, int thatNrColumns, int rl,
-       //      int ru) {
-       //      final int nrVals = getNumValues();
-       //      for(int k = 0; k < nrVals; k++) {
-       //              int boff = _ptr[k];
-       //              int blen = len(k);
-       //              int bix = 0;
-       //              int start = 0;
-
-       //              // scan to beginning offset if necessary
-       //              if(rl > 0) { // rl aligned with blksz
-       //                      while(bix < blen) {
-       //                              int lstart = _data[boff + bix]; // start
-       //                              int llen = _data[boff + bix + 1]; // len
-       //                              if(start + lstart + llen >= rl)
-       //                                      break;
-       //                              start += lstart + llen;
-       //                              bix += 2;
-       //                      }
-       //              }
-       //              // compute partial results, not aligned
-       //              while(bix < blen) {
-       //                      int lstart = _data[boff + bix];
-       //                      int llen = _data[boff + bix + 1];
-       //                      LinearAlgebraUtils.vectListAdd(preAggregatedB, 
c, Math.max(rl, start + lstart),
-       //                              Math.min(start + lstart + llen, ru), 
outputColumns, thatNrColumns, k);
-       //                      if(start + lstart + llen >= ru)
-       //                              break;
-       //                      start += lstart + llen;
-       //                      bix += 2;
-       //              }
-       //      }
+       // public void rightMultByMatrix(int[] outputColumns, double[] 
preAggregatedB, double[] c, int thatNrColumns, int
+       // rl,
+       // int ru) {
+       // final int nrVals = getNumValues();
+       // for(int k = 0; k < nrVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+       // int bix = 0;
+       // int start = 0;
+
+       // // scan to beginning offset if necessary
+       // if(rl > 0) { // rl aligned with blksz
+       // while(bix < blen) {
+       // int lstart = _data[boff + bix]; // start
+       // int llen = _data[boff + bix + 1]; // len
+       // if(start + lstart + llen >= rl)
+       // break;
+       // start += lstart + llen;
+       // bix += 2;
+       // }
+       // }
+       // // compute partial results, not aligned
+       // while(bix < blen) {
+       // int lstart = _data[boff + bix];
+       // int llen = _data[boff + bix + 1];
+       // LinearAlgebraUtils.vectListAdd(preAggregatedB, c, Math.max(rl, start 
+ lstart),
+       // Math.min(start + lstart + llen, ru), outputColumns, thatNrColumns, 
k);
+       // if(start + lstart + llen >= ru)
+       // break;
+       // start += lstart + llen;
+       // bix += 2;
+       // }
+       // }
        // }
 
        // @Override
        // public void leftMultByRowVector(double[] a, double[] c, int numVals, 
double[] values) {
-       //      final int numCols = getNumCols();
-
-       //      if(numVals >= 1 && _numRows > 
CompressionSettings.BITMAP_BLOCK_SZ) {
-       //              double[] cvals = preAggregate(a, 0);
-       //              postScaling(values, cvals, c, numVals);
-       //      }
-       //      else {
-       //              // iterate over all values and their bitmaps
-       //              for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
numCols) {
-       //                      int boff = _ptr[k];
-       //                      int blen = len(k);
-
-       //                      double vsum = 0;
-       //                      int curRunEnd = 0;
-       //                      for(int bix = 0; bix < blen; bix += 2) {
-       //                              int curRunStartOff = curRunEnd + 
_data[boff + bix];
-       //                              int curRunLen = _data[boff + bix + 1];
-       //                              vsum += LinearAlgebraUtils.vectSum(a, 
curRunStartOff, curRunLen);
-       //                              curRunEnd = curRunStartOff + curRunLen;
-       //                      }
-
-       //                      // scale partial results by values and write 
results
-       //                      for(int j = 0; j < numCols; j++)
-       //                              c[_colIndexes[j]] += vsum * 
values[valOff + j];
-       //              }
-       //      }
+       // final int numCols = getNumCols();
+
+       // if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+       // double[] cvals = preAggregate(a, 0);
+       // postScaling(values, cvals, c, numVals);
+       // }
+       // else {
+       // // iterate over all values and their bitmaps
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+
+       // double vsum = 0;
+       // int curRunEnd = 0;
+       // for(int bix = 0; bix < blen; bix += 2) {
+       // int curRunStartOff = curRunEnd + _data[boff + bix];
+       // int curRunLen = _data[boff + bix + 1];
+       // vsum += LinearAlgebraUtils.vectSum(a, curRunStartOff, curRunLen);
+       // curRunEnd = curRunStartOff + curRunLen;
+       // }
+
+       // // scale partial results by values and write results
+       // for(int j = 0; j < numCols; j++)
+       // c[_colIndexes[j]] += vsum * values[valOff + j];
+       // }
+       // }
        // }
 
        // @Override
        // public void leftMultByMatrix(final double[] a, final double[] c, 
final double[] values, final int numRows,
-       //      final int numCols, int rl, final int ru, final int vOff) {
-
-       //      final int numVals = getNumValues();
-       //      if(numVals >= 1 && _numRows > 
CompressionSettings.BITMAP_BLOCK_SZ) {
-       //              for(int i = rl; i < ru; i++) {
-       //                      double[] cvals = preAggregate(a, i);
-       //                      postScaling(values, cvals, c, numVals, i, 
numCols);
-       //              }
-       //      }
-       //      else {
-       //              // iterate over all values and their bitmaps
-       //              for(int i = rl, off = vOff * _numRows; i < ru; i++, off 
+= _numRows) {
-       //                      int offC = i * numCols;
-       //                      int valOff = 0;
-       //                      for(int k = 0; k < numVals; k++) {
-       //                              int boff = _ptr[k];
-       //                              int blen = len(k);
-
-       //                              double vsum = 0;
-       //                              int curRunEnd = 0;
-       //                              for(int bix = 0; bix < blen; bix += 2) {
-       //                                      int curRunStartOff = curRunEnd 
+ _data[boff + bix];
-       //                                      int curRunLen = _data[boff + 
bix + 1];
-       //                                      vsum += 
LinearAlgebraUtils.vectSum(a, curRunStartOff + off, curRunLen);
-       //                                      curRunEnd = curRunStartOff + 
curRunLen;
-       //                              }
-
-       //                              for(int j = 0; j < _colIndexes.length; 
j++) {
-       //                                      int colIx = _colIndexes[j] + 
offC;
-       //                                      // scale partial results by 
values and write results
-       //                                      c[colIx] += vsum * 
values[valOff++];
-       //                              }
-       //                      }
-       //              }
-       //      }
+       // final int numCols, int rl, final int ru, final int vOff) {
+
+       // final int numVals = getNumValues();
+       // if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+       // for(int i = rl; i < ru; i++) {
+       // double[] cvals = preAggregate(a, i);
+       // postScaling(values, cvals, c, numVals, i, numCols);
+       // }
+       // }
+       // else {
+       // // iterate over all values and their bitmaps
+       // for(int i = rl, off = vOff * _numRows; i < ru; i++, off += _numRows) 
{
+       // int offC = i * numCols;
+       // int valOff = 0;
+       // for(int k = 0; k < numVals; k++) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+
+       // double vsum = 0;
+       // int curRunEnd = 0;
+       // for(int bix = 0; bix < blen; bix += 2) {
+       // int curRunStartOff = curRunEnd + _data[boff + bix];
+       // int curRunLen = _data[boff + bix + 1];
+       // vsum += LinearAlgebraUtils.vectSum(a, curRunStartOff + off, 
curRunLen);
+       // curRunEnd = curRunStartOff + curRunLen;
+       // }
+
+       // for(int j = 0; j < _colIndexes.length; j++) {
+       // int colIx = _colIndexes[j] + offC;
+       // // scale partial results by values and write results
+       // c[colIx] += vsum * values[valOff++];
+       // }
+       // }
+       // }
+       // }
        // }
 
        // @Override
-       // public void leftMultBySparseMatrix(SparseBlock sb, double[] c, 
double[] values, int numRows, int numCols, int row) {
-
-       //      final int numVals = getNumValues();
-       //      int sparseEndIndex = sb.size(row) + sb.pos(row);
-       //      int[] indexes = sb.indexes(row);
-       //      double[] sparseV = sb.values(row);
-       //      for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
-       //              int boff = _ptr[k];
-       //              int blen = len(k);
-
-       //              double vsum = 0;
-       //              int pointSparse = sb.pos(row);
-       //              int curRunEnd = 0;
-       //              for(int bix = 0; bix < blen; bix += 2) {
-       //                      int curRunStartOff = curRunEnd + _data[boff + 
bix];
-       //                      int curRunLen = _data[boff + bix + 1];
-       //                      curRunEnd = curRunStartOff + curRunLen;
-       //                      while(pointSparse < sparseEndIndex && 
indexes[pointSparse] < curRunStartOff) {
-       //                              pointSparse++;
-       //                      }
-       //                      while(pointSparse != sparseEndIndex && 
indexes[pointSparse] >= curRunStartOff &&
-       //                              indexes[pointSparse] < curRunEnd) {
-       //                              vsum += sparseV[pointSparse++];
-       //                      }
-       //                      if(pointSparse == sparseEndIndex) {
-       //                              break;
-       //                      }
-       //              }
-
-       //              for(int j = 0; j < _colIndexes.length; j++) {
-       //                      int Voff = _colIndexes[j] + row * numCols;
-       //                      c[Voff] += vsum * values[valOff + j];
-       //              }
-       //      }
+       // public void leftMultBySparseMatrix(SparseBlock sb, double[] c, 
double[] values, int numRows, int numCols, int
+       // row) {
+
+       // final int numVals = getNumValues();
+       // int sparseEndIndex = sb.size(row) + sb.pos(row);
+       // int[] indexes = sb.indexes(row);
+       // double[] sparseV = sb.values(row);
+       // for(int k = 0, valOff = 0; k < numVals; k++, valOff += 
_colIndexes.length) {
+       // int boff = _ptr[k];
+       // int blen = len(k);
+
+       // double vsum = 0;
+       // int pointSparse = sb.pos(row);
+       // int curRunEnd = 0;
+       // for(int bix = 0; bix < blen; bix += 2) {
+       // int curRunStartOff = curRunEnd + _data[boff + bix];
+       // int curRunLen = _data[boff + bix + 1];
+       // curRunEnd = curRunStartOff + curRunLen;
+       // while(pointSparse < sparseEndIndex && indexes[pointSparse] < 
curRunStartOff) {
+       // pointSparse++;
+       // }
+       // while(pointSparse != sparseEndIndex && indexes[pointSparse] >= 
curRunStartOff &&
+       // indexes[pointSparse] < curRunEnd) {
+       // vsum += sparseV[pointSparse++];
+       // }
+       // if(pointSparse == sparseEndIndex) {
+       // break;
+       // }
+       // }
+
+       // for(int j = 0; j < _colIndexes.length; j++) {
+       // int Voff = _colIndexes[j] + row * numCols;
+       // c[Voff] += vsum * values[valOff + j];
+       // }
+       // }
 
        // }
 
@@ -1091,7 +1067,7 @@ public class ColGroupRLE extends ColGroupOffset {
                                lenL = _data[boffL + bixL + 1];
                                final int endL = startL + lenL;
                                for(int i = startL; i < endL; i++)
-                                       ag.increment(lhs.getIndex(i) + offKr);
+                                       ag.increment(lhs._data.getIndex(i) + 
offKr);
 
                        }
                }
@@ -1169,7 +1145,7 @@ public class ColGroupRLE extends ColGroupOffset {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
@@ -1182,4 +1158,9 @@ public class ColGroupRLE extends ColGroupOffset {
        public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret) {
                throw new NotImplementedException();
        }
+
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified){
+               throw new NotImplementedException();
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index 5e2d303..1acbfdc 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -61,7 +61,7 @@ public class ColGroupSDC extends ColGroupValue {
         */
        protected AMapToData _data;
 
-               /**
+       /**
         * Constructor for serialization
         * 
         * @param numRows Number of rows contained
@@ -405,7 +405,7 @@ public class ColGroupSDC extends ColGroupValue {
 
                int row;
                for(; i < this._numRows && it.hasNext(); i++) {
-                       int col = lhs.getIndex(i);
+                       int col = lhs._data.getIndex(i);
                        if(it.value() == i)
                                row = getIndex(it.getDataIndexAndIncrement());
                        else
@@ -414,7 +414,7 @@ public class ColGroupSDC extends ColGroupValue {
                }
                row = offsetToDefault;
                for(; i < this._numRows; i++) {
-                       int col = lhs.getIndex(i);
+                       int col = lhs._data.getIndex(i);
                        ag.increment(col + row * nCol);
                }
 
@@ -612,54 +612,77 @@ public class ColGroupSDC extends ColGroupValue {
 
                for(; i < _numRows && it.hasNext(); i++) {
                        int to = (it.value() == i) ? 
getIndex(it.getDataIndexAndIncrement()) : offsetToDefault;
-                       that._dict.addToEntry(ret, that.getIndex(i), to, nCol);
+                       that._dict.addToEntry(ret, that._data.getIndex(i), to, 
nCol);
                }
 
                for(; i < _numRows; i++)
-                       that._dict.addToEntry(ret, that.getIndex(i), 
offsetToDefault, nCol);
+                       that._dict.addToEntry(ret, that._data.getIndex(i), 
offsetToDefault, nCol);
 
                return ret;
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                final AIterator itThat = that._indexes.getIterator();
                final AIterator itThis = _indexes.getIterator();
+               final int nCol = that._colIndexes.length;
                final int offsetToDefaultThat = that.getNumValues() - 1;
                final int offsetToDefaultThis = getNumValues() - 1;
-               final int nCol = that._colIndexes.length;
-
-               int i = 0;
-
-               for(; i < _numRows && itThat.hasNext() && itThis.hasNext(); 
i++) {
-                       int to = (itThis.value() == i) ? 
getIndex(itThis.getDataIndexAndIncrement()) : offsetToDefaultThis;
-                       int fr = (itThat.value() == i) ? 
that.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefaultThat;
-                       that._dict.addToEntry(ret, fr, to, nCol);
-               }
 
-               if(itThat.hasNext()) {
-                       for(; i < _numRows && itThat.hasNext(); i++) {
-                               int fr = (itThat.value() == i) ? 
that.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefaultThat;
+               if(preModified) {
+                       while(itThat.hasNext() && itThis.hasNext()) {
+                               if(itThat.value() == itThis.value()) {
+                                       final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
+                                       final int to = 
getIndex(itThis.getDataIndexAndIncrement());
+                                       that._dict.addToEntry(ret, fr, to, 
nCol);
+                               }
+                               else if(itThat.value() < itThis.value()) {
+                                       final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
+                                       that._dict.addToEntry(ret, fr, 
offsetToDefaultThis, nCol);
+                               }
+                               else
+                                       itThis.next();
+                       }
+       
+                       while(itThat.hasNext()) {
+                               final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
                                that._dict.addToEntry(ret, fr, 
offsetToDefaultThis, nCol);
                        }
                }
+               else {
+                       int i = 0;
 
-               if(itThis.hasNext()) {
-                       for(; i < _numRows && itThis.hasNext(); i++) {
+                       for(; i < _numRows && itThat.hasNext() && 
itThis.hasNext(); i++) {
                                int to = (itThis.value() == i) ? 
getIndex(itThis.getDataIndexAndIncrement()) : offsetToDefaultThis;
-                               that._dict.addToEntry(ret, offsetToDefaultThat, 
to, nCol);
+                               int fr = (itThat.value() == i) ? 
that.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefaultThat;
+                               that._dict.addToEntry(ret, fr, to, nCol);
+                       }
+
+                       if(itThat.hasNext()) {
+                               for(; i < _numRows && itThat.hasNext(); i++) {
+                                       int fr = (itThat.value() == i) ? that
+                                               
.getIndex(itThat.getDataIndexAndIncrement()) : offsetToDefaultThat;
+                                       that._dict.addToEntry(ret, fr, 
offsetToDefaultThis, nCol);
+                               }
                        }
+
+                       if(itThis.hasNext()) {
+                               for(; i < _numRows && itThis.hasNext(); i++) {
+                                       int to = (itThis.value() == i) ? 
getIndex(itThis.getDataIndexAndIncrement()) : offsetToDefaultThis;
+                                       that._dict.addToEntry(ret, 
offsetToDefaultThat, to, nCol);
+                               }
+                       }
+
+                       for(; i < _numRows; i++)
+                               that._dict.addToEntry(ret, offsetToDefaultThat, 
offsetToDefaultThis, nCol);
                }
 
-               for(; i < _numRows; i++)
-                       that._dict.addToEntry(ret, offsetToDefaultThat, 
offsetToDefaultThis, nCol);
-                       
                return ret;
        }
 
        @Override
        public Dictionary preAggregateThatSDCZerosStructure(ColGroupSDCZeros 
that, Dictionary ret) {
-               
+
                final AIterator itThat = that._indexes.getIterator();
                final AIterator itThis = _indexes.getIterator();
                final int nCol = that._colIndexes.length;
@@ -671,7 +694,7 @@ public class ColGroupSDC extends ColGroupValue {
                                final int to = 
getIndex(itThis.getDataIndexAndIncrement());
                                that._dict.addToEntry(ret, fr, to, nCol);
                        }
-                       else if(itThat.value() < itThis.value()){
+                       else if(itThat.value() < itThis.value()) {
                                final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
                                that._dict.addToEntry(ret, fr, defThis, nCol);
                        }
@@ -679,7 +702,7 @@ public class ColGroupSDC extends ColGroupValue {
                                itThis.next();
                }
 
-               while(itThat.hasNext()){
+               while(itThat.hasNext()) {
                        final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
                        that._dict.addToEntry(ret, fr, defThis, nCol);
                }
@@ -688,7 +711,12 @@ public class ColGroupSDC extends ColGroupValue {
        }
 
        @Override
-       public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret){
+       public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret) {
+               throw new NotImplementedException();
+       }
+
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index 82e5c81..85aab80 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -243,19 +243,24 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
        @Override
        public int[] getCounts(int[] counts) {
-               final AIterator it = _indexes.getIterator();
-
-               while(it.hasNext()) {
-                       it.next();
-                       counts[1]++;
-               }
-               counts[0] = _numRows - counts[1];
+               counts[0] = _indexes.getSize();
+               counts[1] = _numRows - counts[0];
                return counts;
        }
 
        @Override
        public int[] getCounts(int rl, int ru, int[] counts) {
-               throw new NotImplementedException("Not Implemented");
+               final AIterator it = _indexes.getIterator();
+               it.skipTo(rl);
+
+               while(it.hasNext() && it.value() < ru) {
+                       it.next();
+                       counts[0]++;
+               }
+
+               counts[1] = ru - rl - counts[0];
+
+               return counts;
        }
 
        public double[] preAggregate(double[] a, int row) {
@@ -275,7 +280,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        for(; i < _numRows; i++, offA++)
                                vals[0] += a[offA];
                }
-               else{
+               else {
                        for(; i < _numRows && it.hasNext(); i++)
                                if(it.value() == i)
                                        vals[1] += a[i];
@@ -294,10 +299,10 @@ public class ColGroupSDCSingle extends ColGroupValue {
                final int[] indexes = sb.indexes(row);
                final double[] sparseV = sb.values(row);
                final AIterator it = _indexes.getIterator();
-               
+
                for(int i = sb.pos(row); i < sb.size(row) + sb.pos(row); i++) {
                        it.skipTo(indexes[i]);
-                       if(it.value() == indexes[i]){
+                       if(it.value() == indexes[i]) {
                                vals[0] += sparseV[i];
                                it.next();
                        }
@@ -309,7 +314,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
        @Override
        public long estimateInMemorySize() {
-               long size = 
ColGroupSizes.estimateInMemorySizeGroupValue(_colIndexes.length, 
getNumValues(),  isLossy());
+               long size = 
ColGroupSizes.estimateInMemorySizeGroupValue(_colIndexes.length, 
getNumValues(), isLossy());
                size += _indexes.getInMemorySize();
                return size;
        }
@@ -375,8 +380,8 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
                int row;
                for(; i < this._numRows && it.hasNext(); i++) {
-                       int col = lhs.getIndex(i);
-                       if(it.value() == i){
+                       int col = lhs._data.getIndex(i);
+                       if(it.value() == i) {
                                row = 1;
                                it.next();
                        }
@@ -387,7 +392,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
                }
                row = 0;
                for(; i < this._numRows; i++) {
-                       int col = lhs.getIndex(i);
+                       int col = lhs._data.getIndex(i);
                        if(col < lhs.getNumValues())
                                ag.increment(col + row * nCol);
                }
@@ -459,7 +464,8 @@ public class ColGroupSDCSingle extends ColGroupValue {
                final int rhsNV = this.getNumValues();
                final int retSize = lhsNV * rhsNV;
                final int nCol = lhs.getNumValues();
-               IPreAggregate ag = PreAggregateFactory.ag(retSize);;
+               IPreAggregate ag = PreAggregateFactory.ag(retSize);
+               ;
                final AIterator lIt = lhs._indexes.getIterator();
                final AIterator rIt = _indexes.getIterator();
 
@@ -473,7 +479,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        }
                        else
                                col = 0;
-                       if(rIt.value() == i){
+                       if(rIt.value() == i) {
                                row = 1;
                                rIt.next();
                        }
@@ -482,9 +488,9 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        ag.increment(col + row * nCol);
                }
 
-               if(lIt.hasNext() ) {
+               if(lIt.hasNext()) {
                        row = 1;
-                       for(; i < _numRows && lIt.hasNext() ; i++) {
+                       for(; i < _numRows && lIt.hasNext(); i++) {
                                if(lIt.value() == i) {
                                        col = 1;
                                        lIt.next();
@@ -496,10 +502,10 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        }
                }
 
-               if( rIt.hasNext()) {
+               if(rIt.hasNext()) {
                        col = 1;
                        for(; i < _numRows && rIt.hasNext(); i++) {
-                               if(rIt.value()== i) {
+                               if(rIt.value() == i) {
                                        row = 1;
                                        rIt.next();
                                }
@@ -558,18 +564,91 @@ public class ColGroupSDCSingle extends ColGroupValue {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
        @Override
-       public Dictionary preAggregateThatSDCZerosStructure(ColGroupSDCZeros 
that, Dictionary ret){
+       public Dictionary preAggregateThatSDCZerosStructure(ColGroupSDCZeros 
that, Dictionary ret) {
                throw new NotImplementedException();
        }
-       
+
        @Override
-       public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret){
+       public Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret) {
                throw new NotImplementedException();
        }
 
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified) {
+               final AIterator itThat = that._indexes.getIterator();
+               final AIterator itThis = _indexes.getIterator();
+               final int nCol = that._colIndexes.length;
+
+               if(preModified) {
+
+                       while(itThat.hasNext() && itThis.hasNext()) {
+                               if(itThat.value() == itThis.value()) {
+                                       itThat.next();
+                                       itThis.next();
+                                       that._dict.addToEntry(ret, 1, 0, nCol);
+                               }
+                               else if(itThat.value() < itThis.value()) {
+                                       itThat.next();
+                                       // that._dict.addToEntry(ret, 0, 1, 
nCol);
+                               }
+                               else
+                                       itThis.next();
+                       }
+
+                       // while(itThat.hasNext()) {
+                       //      final int fr = 
that.getIndex(itThat.getDataIndexAndIncrement());
+                       //      that._dict.addToEntry(ret, fr, 1, nCol);
+                       // }
+                       return ret;
+               }
+               else {
+                       int i = 0;
+                       for(; i < _numRows && itThat.hasNext() && 
itThis.hasNext(); i++) {
+                               int to = 0;
+                               if(itThis.value() == i) {
+                                       itThis.next();
+                                       to = 1;
+                               }
+                               int fr = 0;
+                               if(itThat.value() == i) {
+                                       itThat.next();
+                                       fr = 1;
+                               }
+                               that._dict.addToEntry(ret, fr, to, nCol);
+                       }
+
+                       if(itThat.hasNext()) {
+                               for(; i < _numRows && itThat.hasNext(); i++) {
+                                       int fr = 0;
+                                       if(itThat.value() == i) {
+                                               itThat.next();
+                                               fr = 1;
+                                       }
+                                       that._dict.addToEntry(ret, fr, 1, nCol);
+                               }
+                       }
+
+                       if(itThis.hasNext()) {
+                               for(; i < _numRows && itThis.hasNext(); i++) {
+                                       int to = 0;
+                                       if(itThis.value() == i) {
+                                               itThis.next();
+                                               to = 1;
+                                       }
+                                       that._dict.addToEntry(ret, 1, to, nCol);
+                               }
+                       }
+
+                       for(; i < _numRows; i++)
+                               that._dict.addToEntry(ret, 1, 1, nCol);
+
+                       return ret;
+               }
+
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
index faf8955..34d148d 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
@@ -67,7 +67,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
                int[] cachedCounts) {
                super(colIndices, numRows, dict, cachedCounts);
                _indexes = OffsetFactory.create(indexes, numRows);
-               _zeros = false;
+               _zeros = true;
        }
 
        protected ColGroupSDCSingleZeros(int[] colIndices, int numRows, 
ADictionary dict, AOffset offsets,
@@ -196,25 +196,22 @@ public class ColGroupSDCSingleZeros extends ColGroupValue 
{
 
        @Override
        public int[] getCounts(int[] counts) {
-               return getCounts(0, _numRows, counts);
+               counts[0] = _indexes.getSize();
+               counts[1] = _numRows - counts[0];
+               return counts;
        }
 
        @Override
        public int[] getCounts(int rl, int ru, int[] counts) {
-               int i = rl;
                final AIterator it = _indexes.getIterator();
                it.skipTo(rl);
 
-               int zeros = 0;
                while(it.hasNext() && it.value() < ru) {
-                       int oldI = i;
-                       i = it.value();
                        it.next();
-                       zeros += i - oldI - 1;
                        counts[0]++;
                }
 
-               counts[counts.length - 1] += zeros + ru - i;
+               counts[1] = ru - rl - counts[0];
 
                return counts;
        }
@@ -287,8 +284,8 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
                        return new ColGroupSDCSingleZeros(_colIndexes, 
_numRows, applyBinaryRowOp(op.fn, v, sparseSafe, left),
                                _indexes, getCachedCounts());
                else {
-                       ADictionary aDictionary = 
swapEntries(applyBinaryRowOp(op.fn, v, sparseSafe, left));
-                       return new ColGroupSDCSingle(_colIndexes, _numRows, 
aDictionary, _indexes, null);
+                       ADictionary aDictionary = applyBinaryRowOp(op.fn, v, 
sparseSafe, left);
+                       return new ColGroupSDCSingle(_colIndexes, _numRows, 
aDictionary, _indexes, getCachedCounts());
                }
        }
 
@@ -348,7 +345,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
                final AIterator it = _indexes.getIterator();
 
                while(it.hasNext()) {
-                       final int col = lhs.getIndex(it.value());
+                       final int col = lhs._data.getIndex(it.value());
                        ag.increment(col);
                }
                return ag;
@@ -427,7 +424,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
@@ -451,4 +448,9 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
 
                return ret;
        }
+
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary ret, boolean preModified){
+               throw new NotImplementedException();
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
index 3f6b8d3..410ec90 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
@@ -358,7 +358,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
                final AIterator it = _indexes.getIterator();
 
                while(it.hasNext()) {
-                       final int col = lhs.getIndex(it.value());
+                       final int col = lhs._data.getIndex(it.value());
                        final int row = getIndex(it.getDataIndexAndIncrement());
                        ag.increment(col + row * nCol);
                }
@@ -482,7 +482,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
                final int nCol = that._colIndexes.length;
 
                while(itThis.hasNext()) {
-                       final int fr = that.getIndex(itThis.value());
+                       final int fr = that._data.getIndex(itThis.value());
                        final int to = 
getIndex(itThis.getDataIndexAndIncrement());
                        that._dict.addToEntry(ret, fr, to, nCol);
                }
@@ -491,7 +491,7 @@ public class ColGroupSDCZeros extends ColGroupValue {
        }
 
        @Override
-       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret) {
+       public Dictionary preAggregateThatSDCStructure(ColGroupSDC that, 
Dictionary ret, boolean preModified) {
                throw new NotImplementedException();
        }
 
@@ -534,4 +534,9 @@ public class ColGroupSDCZeros extends ColGroupValue {
                }
                return ret;
        }
+       
+       @Override
+       public Dictionary preAggregateThatSDCSingleStructure(ColGroupSDCSingle 
that, Dictionary re, boolean preModified){
+               throw new NotImplementedException();
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index b13078c..a848031 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -25,7 +25,6 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.commons.lang.NotImplementedException;
-import org.apache.sysds.runtime.DMLCompressionException;
 import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
@@ -318,16 +317,15 @@ public class ColGroupUncompressed extends AColGroup {
 
        public void leftMultByMatrix(MatrixBlock matrix, double[] result, int 
numCols, int rl, int ru) {
 
-               MatrixBlock tmpRet = new MatrixBlock(ru - rl, 
_data.getNumColumns(), false);
+               final MatrixBlock tmpRet = new MatrixBlock(ru - rl, 
_data.getNumColumns(), false);
                tmpRet.allocateDenseBlock();
-               MatrixBlock leftSlice = matrix.slice(rl, ru - 1, false);
+               final MatrixBlock leftSlice = matrix.slice(rl, ru - 1, false);
                LibMatrixMult.matrixMult(leftSlice, _data, tmpRet);
                int offT = numCols * rl;
-
                if(tmpRet.isEmpty())
                        return;
-               if(tmpRet.isInSparseFormat()) {
-                       SparseBlock sb = tmpRet.getSparseBlock();
+               else if(tmpRet.isInSparseFormat()) {
+                       final SparseBlock sb = tmpRet.getSparseBlock();
                        for(int rowIdx = 0; rowIdx < ru - rl; rowIdx++, offT += 
numCols) {
                                if(!sb.isEmpty(rowIdx)) {
                                        final int apos = sb.pos(rowIdx);
@@ -340,33 +338,10 @@ public class ColGroupUncompressed extends AColGroup {
                        }
                }
                else {
-                       double[] tmpRetV = tmpRet.getDenseBlockValues();
-                       for(int j = rl, offTemp = 0; j < ru; j++, offTemp += 
_colIndexes.length, offT += numCols) {
+                       final double[] tmpRetV = tmpRet.getDenseBlockValues();
+                       for(int j = rl, offTemp = 0; j < ru; j++, offTemp += 
_colIndexes.length, offT += numCols)
                                for(int i = 0; i < _colIndexes.length; i++)
                                        result[offT + _colIndexes[i]] += 
tmpRetV[offTemp + i];
-                       }
-               }
-       }
-
-       public void rightMultByMatrix(int[] outputColumns, double[] 
preAggregatedB, double[] c, int thatNrColumns, int rl,
-               int ru) {
-               throw new NotImplementedException("Should not be called use 
other matrix function for uncompressed columns");
-       }
-
-       public double computeMxx(double c, Builtin builtin) {
-               throw new NotImplementedException("Not implemented max min on 
uncompressed");
-       }
-
-       public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result) {
-               MatrixBlock pret = new MatrixBlock(matrix.getNumRows(), 
_colIndexes.length, false);
-               LibMatrixMult.matrixMult(matrix, _data, pret);
-
-               // copying partialResult to the proper indices of the result
-               if(!pret.isEmptyBlock(false)) {
-                       double[] rsltArr = result.getDenseBlockValues();
-                       for(int colIx = 0; colIx < _colIndexes.length; colIx++)
-                               rsltArr[_colIndexes[colIx]] = 
pret.quickGetValue(0, colIx);
-                       result.recomputeNonZeros();
                }
        }
 
@@ -382,14 +357,20 @@ public class ColGroupUncompressed extends AColGroup {
 
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
-               DenseBlock b = new DenseBlockFP64(new int[] {1, v.length}, v);
-               MatrixBlock that = new MatrixBlock(1, v.length, b);
-               that.setNonZeros(v.length);
+               double[] selectedValues = new double[_colIndexes.length];
+               for(int i = 0; i < _colIndexes.length; i++) {
+                       selectedValues[i] = v[_colIndexes[i]];
+               }
+               DenseBlock b = new DenseBlockFP64(new int[] {1, 
_colIndexes.length}, selectedValues);
+               MatrixBlock that = new MatrixBlock(1, _colIndexes.length, b);
+               that.setNonZeros(_colIndexes.length);
                MatrixBlock resultBlock = new MatrixBlock();
+
                if(left)
                        that.binaryOperations(op, _data, resultBlock);
                else
                        _data.binaryOperations(op, that, resultBlock);
+
                return new ColGroupUncompressed(_colIndexes, resultBlock, 
false);
        }
 
@@ -557,7 +538,10 @@ public class ColGroupUncompressed extends AColGroup {
 
        @Override
        public AColGroup copy() {
-               throw new NotImplementedException("Not implemented copy of 
uncompressed colGroup yet.");
+               MatrixBlock newData = new MatrixBlock(_data.getNumRows(), 
_data.getNumColumns(), _data.isInSparseFormat());
+               // _data.copy(newData);
+               newData.copy(_data);
+               return new ColGroupUncompressed(_colIndexes, newData, false);
        }
 
        @Override
@@ -585,7 +569,49 @@ public class ColGroupUncompressed extends AColGroup {
                if(lhs instanceof ColGroupEmpty)
                        return;
                if(lhs instanceof ColGroupUncompressed) {
-                       throw new DMLCompressionException("Not Implemented");
+                       ColGroupUncompressed lhsUC = (ColGroupUncompressed) lhs;
+                       MatrixBlock tmpRet = new 
MatrixBlock(_colIndexes.length, _colIndexes.length, 0);
+
+                       if(lhsUC._data == this._data) {
+
+                               
LibMatrixMult.matrixMultTransposeSelf(this._data, tmpRet, true,
+                                       
InfrastructureAnalyzer.getLocalParallelism());
+                       }
+                       else {
+                               LOG.warn("Inefficient Left Matrix 
Multiplication with transpose of left hand side : t(l) %*% r");
+                               MatrixBlock lhData = lhsUC._data;
+                               MatrixBlock transposed = new 
MatrixBlock(lhData.getNumColumns(), lhData.getNumRows(), false);
+                               LibMatrixReorg.transpose(lhData, transposed);
+                               transposed.setNonZeros(lhData.getNonZeros());
+                               // do transposed left hand side, matrix 
multiplication.
+                               LibMatrixMult.matrixMult(transposed, 
this._data, tmpRet);
+                       }
+
+                       if(tmpRet.isEmpty())
+                               return;
+                       else if(tmpRet.isInSparseFormat()) {
+                               SparseBlock sb = tmpRet.getSparseBlock();
+                               for(int rowIdx = 0, offT = 0; rowIdx < 
tmpRet.getNumRows(); rowIdx++, offT += numCols) {
+                                       if(!sb.isEmpty(rowIdx)) {
+                                               final int apos = sb.pos(rowIdx);
+                                               final int alen = 
sb.size(rowIdx) + apos;
+                                               final int[] aix = 
sb.indexes(rowIdx);
+                                               final double[] avals = 
sb.values(rowIdx);
+                                               for(int col = apos; col < alen; 
col++)
+                                                       result[offT + 
_colIndexes[aix[col]]] += avals[col];
+                                       }
+                               }
+                       }
+                       else {
+                               double[] tmpRetV = tmpRet.getDenseBlockValues();
+                               for(int j = 0, offTemp = 0, offT = 0;
+                                       j < tmpRet.getNumRows();
+                                       j++, offTemp += _colIndexes.length, 
offT += numCols) {
+                                       for(int i = 0; i < _colIndexes.length; 
i++)
+                                               result[offT + _colIndexes[i]] 
+= tmpRetV[offTemp + i];
+                               }
+                       }
+
                }
                else {
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index 0eeeb13..223c3a0 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -30,21 +30,14 @@ import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.sysds.runtime.DMLCompressionException;
-import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory;
-import org.apache.sysds.runtime.compress.colgroup.dictionary.QDictionary;
 import org.apache.sysds.runtime.compress.colgroup.pre.ArrPreAggregate;
 import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.compress.colgroup.pre.MapPreAggregate;
-import org.apache.sysds.runtime.compress.utils.ABitmap;
-import org.apache.sysds.runtime.compress.utils.Bitmap;
-import org.apache.sysds.runtime.compress.utils.BitmapLossy;
 import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -81,34 +74,6 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                super(numRows);
        }
 
-       /**
-        * Main constructor for the ColGroupValues. Used to contain the 
dictionaries used for the different types of
-        * ColGroup.
-        * 
-        * @param colIndices indices (within the block) of the columns included 
in this column
-        * @param numRows    total number of rows in the parent block
-        * @param ubm        Uncompressed bitmap representation of the block
-        * @param cs         The Compression settings used for compression
-        */
-       protected ColGroupValue(int[] colIndices, int numRows, ABitmap ubm, 
CompressionSettings cs) {
-               super(colIndices, numRows);
-
-               _zeros = ubm.getNumOffsets() < (long) numRows;
-               // sort values by frequency, if requested
-               if(cs.sortValuesByLength && numRows > 
CompressionSettings.BITMAP_BLOCK_SZ)
-                       ubm.sortValuesByFrequency();
-
-               switch(ubm.getType()) {
-                       case Full:
-                               _dict = new Dictionary(((Bitmap) 
ubm).getValues());
-                               break;
-                       case Lossy:
-                               _dict = new QDictionary((BitmapLossy) 
ubm).makeDoubleDictionary();
-                               // _lossy = true;
-                               break;
-               }
-       }
-
        protected ColGroupValue(int[] colIndices, int numRows, ADictionary 
dict, int[] cachedCounts) {
                super(colIndices, numRows);
                _dict = dict;
@@ -130,20 +95,21 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
         * 
         * @return the number of distinct sets of values associated with the 
bitmaps in this column group
         */
-       public int getNumValues() {
+       public final int getNumValues() {
                return _dict.getNumberOfValues(_colIndexes.length);
        }
 
        @Override
-       public double[] getValues() {
+       public final double[] getValues() {
                return _dict != null ? _dict.getValues() : null;
        }
 
-       public ADictionary getDictionary() {
+       public final ADictionary getDictionary() {
                return _dict;
        }
 
-       public void addMinMax(double[] ret) {
+       @Override
+       public final void addMinMax(double[] ret) {
                _dict.addMaxAndMin(ret, _colIndexes);
        }
 
@@ -362,14 +328,6 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                return ret;
        }
 
-       public double getMin() {
-               return computeMxx(Double.POSITIVE_INFINITY, 
Builtin.getBuiltinFnObject(BuiltinCode.MIN));
-       }
-
-       public double getMax() {
-               return computeMxx(Double.NEGATIVE_INFINITY, 
Builtin.getBuiltinFnObject(BuiltinCode.MAX));
-       }
-
        protected double computeMxx(double c, Builtin builtin) {
                if(_zeros)
                        c = builtin.execute(c, 0);
@@ -487,11 +445,11 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
        @Override
        public String toString() {
                StringBuilder sb = new StringBuilder();
+               sb.append(" Is Lossy: " + _dict.isLossy() + " num Rows: " + 
getNumRows() + " contain zero row:" + _zeros);
                sb.append(super.toString());
-               sb.append("Is Lossy: " + _dict.isLossy() + " num Rows: " + 
getNumRows() + " contain zero row:" + _zeros);
                if(_dict != null) {
                        sb.append(String.format("\n%15s%5d ", "Values:", 
_dict.getValues().length));
-                       _dict.getString(sb, _colIndexes.length);
+                       sb.append(_dict.getString(_colIndexes.length));
                }
                return sb.toString();
        }
@@ -556,6 +514,30 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                return super.clone();
        }
 
+       public AColGroup copyAndSet(double[] newDictionary) {
+               try {
+                       ColGroupValue clone = (ColGroupValue) this.clone();
+                       clone.setDictionary(new Dictionary(newDictionary));
+                       return clone;
+               }
+               catch(CloneNotSupportedException e) {
+                       e.printStackTrace();
+               }
+               return null;
+       }
+
+       public AColGroup copyAndSet(ADictionary newDictionary) {
+               try {
+                       ColGroupValue clone = (ColGroupValue) this.clone();
+                       clone.setDictionary(newDictionary);
+                       return clone;
+               }
+               catch(CloneNotSupportedException e) {
+                       e.printStackTrace();
+               }
+               return null;
+       }
+
        public AColGroup copyAndSet(int[] colIndexes, double[] newDictionary) {
                try {
                        ColGroupValue clone = (ColGroupValue) this.clone();
@@ -594,7 +576,6 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                return null;
        }
 
-
        @Override
        protected AColGroup sliceSingleColumn(int idx) {
                ColGroupValue ret = (ColGroupValue) copy();
@@ -667,12 +648,12 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
         */
        protected void postScaling(double[] dictValues, double[] vals, double[] 
c, int numVals, int row, int totalCols,
                int offT) {
-               final int ncol = getNumCols();
+               final int nCol = getNumCols();
                int valOff = 0;
 
                for(int k = 0; k < numVals; k++) {
                        double aval = vals[k];
-                       for(int j = 0; j < ncol; j++) {
+                       for(int j = 0; j < nCol; j++) {
                                int colIx = _colIndexes[j] + row * totalCols;
                                c[offT + colIx] += aval * dictValues[valOff++];
                        }
@@ -771,21 +752,27 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
        /**
         * Pre aggregate into a dictionary. It is assumed that "that" have more 
distinct values than, "this".
         * 
-        * @param that the other column group whose indexes are used for 
aggregation.
+        * @param that      the other column group whose indexes are used for 
aggregation.
+        * @param preModify specifies if the matrix in this
         * @return A aggregate dictionary
         */
-       public Dictionary preAggregateThatIndexStructure(ColGroupValue that) {
+       public Dictionary preAggregateThatIndexStructure(ColGroupValue that, 
boolean preModify) {
+               int outputLength = that._colIndexes.length * 
this.getNumValues();
+               Dictionary ret = new Dictionary(new double[outputLength]);
 
-               Dictionary ret = new Dictionary(new 
double[that._colIndexes.length * this.getNumValues()]);
+               // if(preModify)
+               // LOG.error(preModify + " " + that.getClass().getSimpleName() 
+ " in " + this.getClass().getSimpleName());
 
                if(that instanceof ColGroupDDC)
                        return preAggregateThatDDCStructure((ColGroupDDC) that, 
ret);
                else if(that instanceof ColGroupSDC)
-                       return preAggregateThatSDCStructure((ColGroupSDC) that, 
ret);
-               else if(that instanceof ColGroupSDCZeros)
-                       return 
preAggregateThatSDCZerosStructure((ColGroupSDCZeros) that, ret);
+                       return preAggregateThatSDCStructure((ColGroupSDC) that, 
ret, preModify);
+               else if(that instanceof ColGroupSDCSingle)
+                       return 
preAggregateThatSDCSingleStructure((ColGroupSDCSingle) that, ret, preModify);
                else if(that instanceof ColGroupSDCSingleZeros)
                        return 
preAggregateThatSDCSingleZerosStructure((ColGroupSDCSingleZeros) that, ret);
+               else if(that instanceof ColGroupSDCZeros)
+                       return 
preAggregateThatSDCZerosStructure((ColGroupSDCZeros) that, ret);
                else if(that instanceof ColGroupConst)
                        return preAggregateThatConstStructure((ColGroupConst) 
that, ret);
 
@@ -795,15 +782,17 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
 
        public abstract Dictionary preAggregateThatDDCStructure(ColGroupDDC 
that, Dictionary ret);
 
-       public abstract Dictionary preAggregateThatSDCStructure(ColGroupSDC 
that, Dictionary ret);
+       public abstract Dictionary preAggregateThatSDCStructure(ColGroupSDC 
that, Dictionary ret, boolean preModified);
 
        public abstract Dictionary 
preAggregateThatSDCZerosStructure(ColGroupSDCZeros that, Dictionary ret);
 
        public abstract Dictionary 
preAggregateThatSDCSingleZerosStructure(ColGroupSDCSingleZeros that, Dictionary 
ret);
 
+       public abstract Dictionary 
preAggregateThatSDCSingleStructure(ColGroupSDCSingle that, Dictionary ret,
+               boolean preModified);
+
        public Dictionary preAggregateThatConstStructure(ColGroupConst that, 
Dictionary ret) {
                computeColSums(ret.getValues(), false);
-               LOG.error(Arrays.toString(ret.getValues()));
                return ret;
        }
 
@@ -835,53 +824,82 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                final int lCol = lhs._colIndexes.length;
                final int rCol = this._colIndexes.length;
 
+               final double threshold = 0.2;
+
                if(sameIndexStructure(lhs)) {
                        int[] agI = getCounts();
                        for(int a = 0, off = 0; a < nvL; a++, off += nvL + 1)
                                leftMultDictEntry(agI[a], off, nvL, lCol, rCol, 
lhs, numCols, lhValues, rhValues, result);
                }
                else if(lhs instanceof ColGroupConst || this instanceof 
ColGroupConst) {
-                       IPreAggregate ag = preAggregate(lhs);
-                       if(ag == null)
-                               return;
-                       else if(ag instanceof MapPreAggregate)
-                               leftMultMapPreAggregate(nvL, lCol, rCol, lhs, 
numCols, lhValues, rhValues, result,
-                                       (MapPreAggregate) ag);
-                       else
-                               leftMultArrayPreAggregate(nvL, nvR, lCol, rCol, 
lhs, numCols, lhValues, rhValues, result,
-                                       ((ArrPreAggregate) ag).getArr());
+                       double[] r = this instanceof ColGroupConst ? rhValues : 
this._dict.colSum(getCounts(), rCol);
+                       double[] l = lhs instanceof ColGroupConst ? lhValues : 
lhs._dict.colSum(lhs.getCounts(), lCol);
+                       vectorVectorMultiply(l, lhs._colIndexes, r, 
this._colIndexes, result, numCols);
                }
                else {
-                       if(nvR * rCol < nvL * lCol) {
-                               Dictionary preAgg = 
lhs.preAggregateThatIndexStructure(this);
+                       int[] countsRight = getCounts();
+                       int mostFrequentRight = Math.max(countsRight[0], 
countsRight[countsRight.length - 1]);
+                       double percentageRight = (double) mostFrequentRight / 
this._numRows;
+                       double skipRight = percentageRight * rCol;
+                       int[] countsLeft = lhs.getCounts();
+                       int mostFrequentLeft = Math.max(countsLeft[0], 
countsLeft[countsLeft.length - 1]);
+                       double percentageLeft = (double) mostFrequentLeft / 
this._numRows;
+                       double skipLeft = percentageLeft * lCol;
+
+                       if(skipRight > threshold && percentageRight > 
percentageLeft && !(this instanceof ColGroupDDC)) {
+                               double[] mct = 
this._dict.getMostCommonTuple(this.getCounts(), rCol);
+                               double[] lhsSum = 
lhs._dict.colSum(lhs.getCounts(), lCol);
+                               if(mct != null)
+                                       vectorVectorMultiply(lhsSum, 
lhs._colIndexes, mct, this._colIndexes, result, numCols);
+
+                               ColGroupValue thisM = (mct != null) ? 
(ColGroupValue) this
+                                       
.copyAndSet(this._dict.subtractTuple(mct)) : this;
+                               Dictionary preAgg = 
lhs.preAggregateThatIndexStructure(thisM, true);
+                               
matrixMultDictionariesAndOutputToColIndexes(lhValues, preAgg.getValues(), 
lhs._colIndexes,
+                                       this._colIndexes, result, numCols);
+                       }
+                       else if(skipLeft > threshold && !(lhs instanceof 
ColGroupDDC)) {
+                               double[] mct = 
lhs._dict.getMostCommonTuple(lhs.getCounts(), lCol);
+                               double[] thisColSum = 
this._dict.colSum(getCounts(), rCol);
+                               if(mct != null)
+                                       vectorVectorMultiply(mct, 
lhs._colIndexes, thisColSum, this._colIndexes, result, numCols);
+
+                               ColGroupValue lhsM = (mct != null) ? 
(ColGroupValue) lhs.copyAndSet(lhs._dict.subtractTuple(mct)) : lhs;
+                               Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhsM, true);
+                               
matrixMultDictionariesAndOutputToColIndexes(preAgg.getValues(), rhValues, 
lhs._colIndexes,
+                                       this._colIndexes, result, numCols);
+                       }
+                       else if(nvR * rCol < nvL * lCol) {
+                               Dictionary preAgg = 
lhs.preAggregateThatIndexStructure(this, false);
                                
matrixMultDictionariesAndOutputToColIndexes(lhValues, preAgg.getValues(), 
lhs._colIndexes,
                                        this._colIndexes, result, numCols);
                        }
                        else {
-                               Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhs);
+                               Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhs, false);
                                
matrixMultDictionariesAndOutputToColIndexes(preAgg.getValues(), rhValues, 
lhs._colIndexes,
                                        this._colIndexes, result, numCols);
                        }
                }
        }
 
-       private void leftMultMapPreAggregate(final int nvL, final int lCol, 
final int rCol, final ColGroupValue lhs,
-               final int numCols, double[] lhValues, double[] rhValues, 
double[] c, MapPreAggregate agM) {
-               final int[] map = agM.getMap();
-               final int aggSize = agM.getSize();
-               for(int k = 0; k < aggSize; k += 2)
-                       leftMultDictEntry(map[k + 1], map[k], nvL, lCol, rCol, 
lhs, numCols, lhValues, rhValues, c);
-               leftMultDictEntry(agM.getMapFreeValue(), 0, nvL, lCol, rCol, 
lhs, numCols, lhValues, rhValues, c);
-       }
+       // private void leftMultMapPreAggregate(final int nvL, final int lCol, 
final int rCol, final ColGroupValue lhs,
+       // final int numCols, double[] lhValues, double[] rhValues, double[] c, 
MapPreAggregate agM) {
+       // final int[] map = agM.getMap();
+       // final int aggSize = agM.getSize();
+       // for(int k = 0; k < aggSize; k += 2)
+       // leftMultDictEntry(map[k + 1], map[k], nvL, lCol, rCol, lhs, numCols, 
lhValues, rhValues, c);
+       // leftMultDictEntry(agM.getMapFreeValue(), 0, nvL, lCol, rCol, lhs, 
numCols, lhValues, rhValues, c);
+       // }
 
-       private void leftMultArrayPreAggregate(final int nvL, final int nvR, 
final int lCol, final int rCol,
-               final ColGroupValue lhs, final int numCols, double[] lhValues, 
double[] rhValues, double[] c, int[] arr) {
-               for(int a = 0; a < nvL * nvR; a++)
-                       leftMultDictEntry(arr[a], a, nvL, lCol, rCol, lhs, 
numCols, lhValues, rhValues, c);
-       }
+       // private void leftMultArrayPreAggregate(final int nvL, final int nvR, 
final int lCol, final int rCol,
+       // final ColGroupValue lhs, final int numCols, double[] lhValues, 
double[] rhValues, double[] c, int[] arr) {
+       // for(int a = 0; a < nvL * nvR; a++)
+       // leftMultDictEntry(arr[a], a, nvL, lCol, rCol, lhs, numCols, 
lhValues, rhValues, c);
+       // }
 
        private void leftMultDictEntry(final int m, final int a, final int nvL, 
final int lCol, final int rCol,
-               final ColGroupValue lhs, final int numCols, double[] lhValues, 
double[] rhValues, double[] c) {
+               final ColGroupValue lhs, final int numCols, final double[] 
lhValues, final double[] rhValues,
+               final double[] c) {
 
                if(m > 0) {
                        final int lhsRowOffset = (a % nvL) * lCol;
@@ -930,27 +948,90 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                return _dict.getNumberNonZeros(counts, _colIndexes.length);
        }
 
-       private static void 
matrixMultDictionariesAndOutputToColIndexes(double[] left, double[] right, 
int[] colsLeft,
-               int[] colsRight, double[] result, int outCols) {
-               final int rows = left.length / colsLeft.length;
-               for(int k = 0; k < rows; k++) {
-                       final int offL = k * colsLeft.length;
-                       final int offR = k * colsRight.length;
-                       for(int i = 0; i < colsLeft.length; i++) {
-                               final int offOut = colsLeft[i] * outCols;
-                               final double vl = left[offL + i];
-                               if(vl != 0)
-                                       for(int j = 0; j < colsRight.length; 
j++) {
-                                               final double vr = right[offR + 
j];
-                                               result[offOut + colsRight[j]] 
+= vl * vr;
-                                       }
-                       }
+       private static void vectorVectorMultiply(final double[] left, final 
int[] leftRows, final double[] right,
+               final int[] rightColumns, final double[] result, final int 
outCols) {
+               if(left.length != leftRows.length) {
+                       // LOG.error(Arrays.toString(left));
+                       // LOG.error(Arrays.toString(right));
+                       // LOG.error(Arrays.toString(leftRows));
+                       // LOG.error(Arrays.toString(rightColumns));
+                       throw new DMLCompressionException(
+                               "Error left length " + left.length + " not 
equal columns length" + leftRows.length);
+               }
+               if(right.length != rightColumns.length)
+                       throw new DMLCompressionException(
+                               "Error right not equal length " + right.length 
+ "  " + rightColumns.length);
+               for(int row = 0; row < leftRows.length; row++) {
+                       final int outputRowOffset = leftRows[row] * outCols;
+                       final double vLeft = left[row];
+                       for(int col = 0; col < rightColumns.length; col++)
+                               result[outputRowOffset + rightColumns[col]] += 
vLeft * right[col];
                }
        }
 
-       @Override
-       public int getNumRows() {
-               return _numRows;
+       private static boolean logMM = true;
+
+       /**
+        * Matrix Multiply the two matrices, note that the left side is 
transposed,
+        * 
+        * making the multiplication a: t(left) %*% right
+        * 
+        * @param left      The left side matrix, transposed linearized row 
major
+        * @param right     The right hand side linearized row major
+        * @param rowsLeft  The number of rows and the row indexes on the left 
hand side
+        * @param colsRight The number of columns and the column indexes on the 
right hand side
+        * @param result    The result matrix to put the results into, 
linearized row major
+        * @param outCols   The output columns count, to know how much to 
offset into with results.
+        */
+       private static void 
matrixMultDictionariesAndOutputToColIndexes(double[] left, double[] right, 
int[] rowsLeft,
+               int[] colsRight, double[] result, int outCols) {
+
+               try {
+                       final int rows = left.length / rowsLeft.length;
+                       if(rows != right.length / colsRight.length)
+                               throw new DMLCompressionException(
+                                       "Not equal number of rows: " + rows + " 
" + right.length / colsRight.length);
+                       for(int k = 0; k < rows; k++) {
+                               final int offL = k * rowsLeft.length;
+                               final int offR = k * colsRight.length;
+                               // final int offL = k * colsRight.length;
+                               // final int offR = k * rowsLeft.length;
+                               // if(offR < right.length && offL < left.length)
+                               for(int i = 0; i < rowsLeft.length; i++) {
+                                       final int offOut = rowsLeft[i] * 
outCols;
+                                       final double vl = left[offL + i];
+                                       if(vl != 0)
+                                               for(int j = 0; j < 
colsRight.length; j++) {
+                                                       final double vr = 
right[offR + j];
+                                                       result[offOut + 
colsRight[j]] += vl * vr;
+                                               }
+                               }
+                       }
+               }
+               catch(Exception e) {
+
+                       if(logMM) {
+                               StringBuilder sb = new StringBuilder();
+                               sb.append("\nLeft (transposed):\n");
+                               for(int i = 0; i < rowsLeft.length; i++) {
+                                       for(int j = i * rowsLeft.length; j < (i 
+ 1) * rowsLeft.length; j++)
+                                               sb.append(left[j] + ", ");
+                                       sb.append("\n");
+                               }
+                               LOG.error(sb);
+
+                               sb = new StringBuilder();
+                               sb.append("\nRight:\n");
+                               for(int i = 0; i < colsRight.length; i++) {
+                                       for(int j = i * colsRight.length; j < 
(i + 1) * colsRight.length; j++)
+                                               sb.append(right[j] + ", ");
+                                       sb.append("\n");
+                               }
+                               LOG.error(sb);
+                               logMM = false;
+                       }
+                       throw new DMLCompressionException("MM of pre aggregated 
colGroups failed", e);
+               }
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index 9b084e8..1aeda85 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -164,7 +164,7 @@ public abstract class ADictionary {
         * 
         * @return the long count of bytes to store the dictionary.
         */
-       public long getExactSizeOnDisk(){
+       public long getExactSizeOnDisk() {
                return 1;
        }
 
@@ -212,13 +212,15 @@ public abstract class ADictionary {
         */
        public abstract double sumRow(int k, boolean square, int nrColumns);
 
+       public abstract double[] colSum(int[] counts, int nCol);
+
        public abstract void colSum(double[] c, int[] counts, int[] colIndexes, 
boolean square);
 
        public abstract double sum(int[] counts, int ncol);
 
        public abstract double sumsq(int[] counts, int ncol);
 
-       public abstract StringBuilder getString(StringBuilder sb, int 
colIndexes);
+       public abstract String getString(int colIndexes);
 
        /**
         * This method adds the max and min values contained in the dictionary 
to corresponding cells in the ret variable.
@@ -252,6 +254,8 @@ public abstract class ADictionary {
 
        public abstract long getNumberNonZeros(int[] counts, int nCol);
 
+       public abstract long getNumberNonZerosContained();
+
        /**
         * Copies and adds the dictionary entry from this dictionary to the d 
dictionary
         * 
@@ -268,4 +272,23 @@ public abstract class ADictionary {
                else
                        return new Dictionary(((Bitmap) ubm).getValues());
        }
+
+       /**
+        * Get the most common tuple element contained in the dictionary
+        * 
+        * returns null if that tuple is all zero values.
+        * 
+        * @param counts The counts of the individual tuples contained, managed 
by the column group.
+        * @return a new double array containing the most common value
+        */
+       public abstract double[] getMostCommonTuple(int[] counts, int nCol);
+
+       /**
+        * Allocate a new dictionary where the tuple given is subtracted from 
all tuples in the previous dictionary.
+        * 
+        * @param tuple a double list representing a tuple, it is given that 
the tuple with is the same as this
+        *              dictionaries.
+        * @return a new instance of dictionary with the tuple subtracted.
+        */
+       public abstract ADictionary subtractTuple(double[] tuple);
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index 26d090d..b4f843f 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -85,9 +85,8 @@ public class Dictionary extends ADictionary {
        @Override
        public double aggregate(double init, Builtin fn) {
                // full aggregate can disregard tuple boundaries
-               int len = size();
                double ret = init;
-               for(int i = 0; i < len; i++)
+               for(int i = 0; i < _values.length; i++)
                        ret = fn.execute(ret, _values[i]);
                return ret;
        }
@@ -249,12 +248,23 @@ public class Dictionary extends ADictionary {
        }
 
        @Override
+       public double[] colSum(int[] counts, int nCol) {
+               final double[] res = new double[nCol];
+               int idx = 0;
+               for(int k = 0; k < _values.length / nCol; k++) {
+                       final int cntk = counts[k];
+                       for(int j = 0; j < nCol; j++)
+                               res[j] += _values[idx++] * cntk;
+               }
+               return res;
+       }
+
+       @Override
        public void colSum(double[] c, int[] counts, int[] colIndexes, boolean 
square) {
                if(_values == null)
                        return;
-
                for(int k = 0; k < _values.length / colIndexes.length; k++) {
-                       int cntk = counts[k];
+                       final int cntk = counts[k];
                        for(int j = 0; j < colIndexes.length; j++) {
                                double v = _values[k * colIndexes.length + j];
                                if(square)
@@ -328,17 +338,22 @@ public class Dictionary extends ADictionary {
                }
        }
 
-       public StringBuilder getString(StringBuilder sb, int colIndexes) {
-               sb.append("[");
-               for(int i = 0; i < _values.length - 1; i++) {
-                       sb.append(_values[i]);
-                       sb.append((i) % (colIndexes) == colIndexes - 1 ? ", " : 
": ");
-               }
-               if(_values != null && _values.length > 0) {
-                       sb.append(_values[_values.length - 1]);
+       public String getString(int colIndexes) {
+               StringBuilder sb = new StringBuilder();
+               if(colIndexes == 1)
+                       sb.append(Arrays.toString(_values));
+               else {
+                       sb.append("[");
+                       for(int i = 0; i < _values.length - 1; i++) {
+                               sb.append(_values[i]);
+                               sb.append((i) % (colIndexes) == colIndexes - 1 
? "\n: " : ", ");
+                       }
+                       if(_values != null && _values.length > 0) {
+                               sb.append(_values[_values.length - 1]);
+                       }
+                       sb.append("]");
                }
-               sb.append("]");
-               return sb;
+               return sb.toString();
        }
 
        public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int 
previousNumberOfColumns) {
@@ -419,4 +434,46 @@ public class Dictionary extends ADictionary {
        public boolean isLossy() {
                return false;
        }
+
+       @Override
+       public long getNumberNonZerosContained() {
+               long count = 0;
+               for(double v : _values) {
+                       if(v != 0.0)
+                               count++;
+               }
+               return count;
+       }
+
+       @Override
+       public double[] getMostCommonTuple(int[] counts, int nCol) {
+               int maxIndex = 0;
+               int maxCount = 0;
+               for(int i = 0; i < counts.length; i++) {
+                       if(counts[i] > maxCount) {
+                               maxCount = counts[i];
+                               maxIndex = i;
+                       }
+               }
+               final double[] tuple = new double[nCol];
+               boolean allZero = true;
+               for(int i = maxIndex * nCol, off = 0; i < (maxIndex + 1) * nCol 
&& i < _values.length; i++, off++) {
+                       final double v = _values[i];
+                       if(v != 0) {
+                               tuple[off] = _values[i];
+                               allZero = false;
+                       }
+               }
+
+               return allZero ? null : tuple;
+       }
+
+       @Override
+       public ADictionary subtractTuple(double[] tuple) {
+               double[] newValues = new double[_values.length];
+               for(int i = 0; i < _values.length; i++) {
+                       newValues[i] = _values[i] - tuple[i % tuple.length];
+               }
+               return new Dictionary(newValues);
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
index 524bb32..a8559db 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
@@ -314,6 +314,20 @@ public class QDictionary extends ADictionary {
                }
        }
 
+
+       @Override
+       public double[] colSum(int[] counts, int nCol){
+               throw new NotImplementedException("Not Implemented");
+               // final double[] res = new double[counts.length];
+               // int idx = 0;
+               // for(int k = 0; k< _values.length / counts.length; k++){
+               //      final int cntk = counts[k];
+               //      for(int j = 0; j< counts.length; j++){
+               //              res[j] += _values[idx++] * cntk;
+               //      }
+               // }
+               // return res;
+       }
        @Override
        public void colSum(double[] c, int[] counts, int[] colIndexes, boolean 
square) {
                throw new NotImplementedException("Not Implemented");
@@ -398,12 +412,13 @@ public class QDictionary extends ADictionary {
                }
        }
 
-       public StringBuilder getString(StringBuilder sb, int colIndexes) {
+       public String getString( int colIndexes) {
+               StringBuilder sb = new StringBuilder();
                for(int i = 0; i < size(); i++) {
                        sb.append(_values[i]);
                        sb.append((i) % (colIndexes) == colIndexes - 1 ? "\n" : 
" ");
                }
-               return sb;
+               return sb.toString();
        }
 
        public Dictionary makeDoubleDictionary() {
@@ -470,4 +485,24 @@ public class QDictionary extends ADictionary {
        public boolean isLossy() {
                return false;
        }
+
+       @Override
+       public long getNumberNonZerosContained(){
+               long count = 0;
+               for(double v : _values){
+                       if(v != 0.0)
+                               count++;
+               }
+               return count;
+       }
+
+       @Override
+       public double[] getMostCommonTuple(int[] counts, int nCol){
+               return null;
+       }
+
+       @Override
+       public ADictionary subtractTuple(double[] tuple){
+               throw new NotImplementedException();
+       }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
index 13b2007..f6c734c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
@@ -83,7 +83,6 @@ public class CLALibCompAgg {
                        else
                                
aggregateUnaryNormalCompressedMatrixBlock(inputMatrix, outputMatrix, op, blen, 
indexesIn, inCP);
                }
-
                outputMatrix.recomputeNonZeros();
 
                return outputMatrix;
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
index b505c71..2779a68 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
@@ -35,10 +35,8 @@ import org.apache.sysds.runtime.compress.colgroup.AColGroup;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupValue;
 import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
-import org.apache.sysds.runtime.functionobjects.SwapIndex;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 
 public class CLALibLeftMultBy {
@@ -46,16 +44,18 @@ public class CLALibLeftMultBy {
 
        public static MatrixBlock 
leftMultByMatrixTransposed(CompressedMatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret,
                int k) {
+               if(m2.isEmpty())
+                       return ret;
                MatrixBlock transposed = new MatrixBlock(m2.getNumColumns(), 
m2.getNumRows(), false);
                LibMatrixReorg.transpose(m2, transposed);
                ret = leftMultByMatrix(m1, transposed, ret, k);
                ret.recomputeNonZeros();
                return ret;
-               // return LibMatrixReorg.transpose(ret, new 
MatrixBlock(ret.getNumColumns(), ret.getNumRows(), false));
        }
 
        public static MatrixBlock 
leftMultByMatrixTransposed(CompressedMatrixBlock m1, CompressedMatrixBlock m2,
                MatrixBlock ret, int k) {
+
                prepareReturnMatrix(m1, m2, ret, true);
                leftMultByCompressedTransposedMatrix(m1.getColGroups(), m2, 
ret, k, m1.getNumColumns(), m1.getMaxNumValues(),
                        m1.isOverlapping());
@@ -66,24 +66,14 @@ public class CLALibLeftMultBy {
 
        public static MatrixBlock leftMultByMatrix(CompressedMatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret, int k) {
                prepareReturnMatrix(m1, m2, ret, false);
-               ret = leftMultByMatrix(m1.getColGroups(), m2, ret, false, 
m1.getNumColumns(), m1.isOverlapping(), k,
-                       m1.getMaxNumValues());
+               if(m2.isEmpty())
+                       return ret;
+               ret = leftMultByMatrix(m1.getColGroups(), m2, ret, k, 
m1.getNumColumns(), m1.getMaxNumValues(),
+                       m1.isOverlapping());
                ret.recomputeNonZeros();
                return ret;
        }
 
-       private static MatrixBlock leftMultByMatrix(List<AColGroup> groups, 
MatrixBlock that, MatrixBlock ret,
-               boolean doTranspose, int numCols, boolean overlapping, int k, 
Pair<Integer, int[]> v) {
-
-               if(doTranspose) {
-                       ReorgOperator r_op = new 
ReorgOperator(SwapIndex.getSwapIndexFnObject(), k);
-                       that = that.reorgOperations(r_op, new MatrixBlock(), 0, 
0, 0);
-               }
-
-               return leftMultByMatrix(groups, that, ret, k, numCols, v, 
overlapping);
-
-       }
-
        private static MatrixBlock prepareReturnMatrix(MatrixBlock m1, 
MatrixBlock m2, MatrixBlock ret,
                boolean doTranspose) {
                int numRowsOutput = doTranspose ? m2.getNumColumns() : 
m2.getNumRows();
@@ -95,9 +85,9 @@ public class CLALibLeftMultBy {
                return ret;
        }
 
-       public static void leftMultByTransposeSelf(CompressedMatrixBlock mb, 
MatrixBlock result, int k) {
+       // public static void leftMultByTransposeSelf(CompressedMatrixBlock mb, 
MatrixBlock result, int k) {
 
-       }
+       // }
 
        public static void leftMultByTransposeSelf(List<AColGroup> groups, 
MatrixBlock result, int k, int numColumns,
                Pair<Integer, int[]> v, boolean overlapping) {
@@ -109,11 +99,18 @@ public class CLALibLeftMultBy {
                        return;
                }
 
-               if(k <= 1 || overlapping) {
-                       if(overlapping)
-                               LOG.warn(
-                                       "Inefficient TSMM with overlapping 
matrix Could be implemented multi-threaded but is not yet.");
+               if(overlapping) {
+                       LOG.warn("Inefficient TSMM with overlapping matrix 
could be implemented multi-threaded but is not yet.");
                        leftMultByCompressedTransposedMatrix(groups, groups, 
result);
+
+                       result.recomputeNonZeros();
+                       return;
+               }
+
+               if(k <= 1) {
+                       for(int i = 0; i < groups.size(); i++)
+                               
leftMultByCompressedTransposedMatrix(groups.get(i), groups, 
result.getDenseBlockValues(),
+                                       result.getNumRows(), 
result.getNumColumns(), i, groups.size());
                }
                else {
                        try {
@@ -132,8 +129,8 @@ public class CLALibLeftMultBy {
                                throw new DMLRuntimeException(e);
                        }
                }
-               copyToUpperTriangle(result.getDenseBlockValues(), numColumns);
 
+               copyToUpperTriangle(result.getDenseBlockValues(), numColumns);
                result.recomputeNonZeros();
        }
 
@@ -162,9 +159,10 @@ public class CLALibLeftMultBy {
                List<AColGroup> thatCGs = that.getColGroups();
                Pair<Integer, int[]> thatV = that.getMaxNumValues();
 
-               if(k <= 1 || overlapping || that.isOverlapping()){
+               if(k <= 1 || overlapping || that.isOverlapping()) {
                        if(overlapping || that.isOverlapping())
-                               LOG.warn("Inefficient Compressed multiplication 
with overlapping matrix could be implemented multi-threaded but is not yet.");
+                               LOG.warn(
+                                       "Inefficient Compressed multiplication 
with overlapping matrix could be implemented multi-threaded but is not yet.");
                        leftMultByCompressedTransposedMatrix(colGroups, 
thatCGs, ret);
                }
                else
@@ -220,6 +218,7 @@ public class CLALibLeftMultBy {
 
                        }
                        catch(Exception e) {
+                               e.printStackTrace();
                                throw new DMLRuntimeException(e);
                        }
                        return null;
@@ -229,7 +228,6 @@ public class CLALibLeftMultBy {
        private static void 
leftMultByCompressedTransposedMatrix(List<AColGroup> thisCG, List<AColGroup> 
thatCG,
                MatrixBlock ret) {
                double[] c = ret.getDenseBlockValues();
-
                for(AColGroup lhs : thatCG) {
                        leftMultByCompressedTransposedMatrix(lhs, thisCG, c, 
ret.getNumRows(), ret.getNumColumns(), 0,
                                thisCG.size());
@@ -256,16 +254,9 @@ public class CLALibLeftMultBy {
                ret.allocateDenseBlock();
                double[] retV = ret.getDenseBlockValues();
 
-               // for(int b = 0; b < db.numBlocks(); b++) {
-               // int blockSize = db.blockSize(b);
-               // blockU = Math.min(blockL + blockSize, ret.getNumRows());
-
-               if(k == 1) {
-                       LOG.trace("Single treaded left matrix multiplication");
-                       for(int j = 0; j < colGroups.size(); j++) {
+               if(k == 1)
+                       for(int j = 0; j < colGroups.size(); j++)
                                colGroups.get(j).leftMultByMatrix(that, retV, 
numColumns);
-                       }
-               }
                else {
                        try {
                                ExecutorService pool = CommonThreadPool.get(k);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java
index 5417470..5b0a105 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRightMultBy.java
@@ -20,7 +20,9 @@
 package org.apache.sysds.runtime.compress.lib;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
@@ -32,7 +34,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 
@@ -55,34 +57,18 @@ public class CLALibRightMultBy {
                that = that instanceof CompressedMatrixBlock ? 
((CompressedMatrixBlock) that).decompress(k) : that;
 
                MatrixBlock m = rightMultByMatrixOverlapping(colGroups, that, 
ret, k, v);
+
                if(m instanceof CompressedMatrixBlock)
                        if(allowOverlappingOutput(colGroups, allowOverlap))
                                return m;
-                       else {
-                               CompressedMatrixBlock outBlock = 
(CompressedMatrixBlock) m;
-                               ColGroupUncompressed uccg = 
findUncompressedColGroup(outBlock.getColGroups());
-                               if(uccg == null)
-                                       return outBlock.decompress(k);
-                               else{
-                                       MatrixBlock outputBlock = 
uccg.getData();
-                                       outputBlock.sparseToDense();
-                                       return outBlock.decompress(outputBlock, 
k);
-                               }
-                       }
+                       else
+                               return ((CompressedMatrixBlock) 
m).decompress(k);
                else
                        return m;
        }
 
-       private static ColGroupUncompressed 
findUncompressedColGroup(List<AColGroup> colGroups){
-               for(AColGroup g: colGroups){
-                       if(g instanceof ColGroupUncompressed)
-                               return (ColGroupUncompressed) g;
-               }
-               return null;
-       }
-
        private static boolean allowOverlappingOutput(List<AColGroup> 
colGroups, boolean allowOverlap) {
-               
+
                if(!allowOverlap) {
                        LOG.debug("Not Overlapping because it is not allowed");
                        return false;
@@ -91,18 +77,18 @@ public class CLALibRightMultBy {
                        return true;
                // int distinctCount = 0;
                // for(AColGroup g : colGroups) {
-               //      if(g instanceof ColGroupCompressed)
-               //              distinctCount += ((ColGroupCompressed) 
g).getNumValues();
-               //      else {
-               //              LOG.debug("Not Overlapping because there is an 
un-compressed column group");
-               //              return false;
-               //      }
+               // if(g instanceof ColGroupCompressed)
+               // distinctCount += ((ColGroupCompressed) g).getNumValues();
+               // else {
+               // LOG.debug("Not Overlapping because there is an un-compressed 
column group");
+               // return false;
+               // }
                // }
                // final int threshold = colGroups.get(0).getNumRows() / 2;
                // boolean allow = distinctCount <= threshold;
                // if(LOG.isDebugEnabled() && !allow)
-               //      LOG.debug("Not Allowing Overlap because of number of 
distinct items in compression: " + distinctCount
-               //              + " is greater than threshold: " + threshold);
+               // LOG.debug("Not Allowing Overlap because of number of 
distinct items in compression: " + distinctCount
+               // + " is greater than threshold: " + threshold);
                // return allow;
 
        }
@@ -122,9 +108,15 @@ public class CLALibRightMultBy {
                CompressedMatrixBlock ret, int k, Pair<Integer, int[]> v) {
 
                List<AColGroup> retCg = new ArrayList<>();
+               boolean containsNull = false;
                if(k == 1) {
-                       for(AColGroup g : colGroups)
-                               retCg.add(g.rightMultByMatrix(that));
+                       for(AColGroup g : colGroups) {
+                               AColGroup retG = g.rightMultByMatrix(that);
+                               if(retG != null)
+                                       retCg.add(retG);
+                               else
+                                       containsNull = true;
+                       }
                }
                else {
                        ExecutorService pool = CommonThreadPool.get(k);
@@ -132,10 +124,12 @@ public class CLALibRightMultBy {
                                List<Callable<AColGroup>> tasks = new 
ArrayList<>(colGroups.size());
                                for(AColGroup g : colGroups)
                                        tasks.add(new RightMatrixMultTask(g, 
that));
-                               for(Future<AColGroup> fg : 
pool.invokeAll(tasks)){
+                               for(Future<AColGroup> fg : 
pool.invokeAll(tasks)) {
                                        AColGroup g = fg.get();
                                        if(g != null)
                                                retCg.add(g);
+                                       else
+                                               containsNull = true;
                                }
                        }
                        catch(InterruptedException | ExecutionException e) {
@@ -145,9 +139,32 @@ public class CLALibRightMultBy {
                ret.allocateColGroupList(retCg);
                if(retCg.size() > 1)
                        ret.setOverlapping(true);
+
+               if(containsNull) {
+                       ColGroupEmpty cge = 
findEmptyColumnsAndMakeEmptyColGroup(retCg, ret.getNumColumns());
+                       if(cge != null)
+                               retCg.add(cge);
+               }
                return ret;
        }
 
+       private static ColGroupEmpty 
findEmptyColumnsAndMakeEmptyColGroup(List<AColGroup> colGroups, int nCols) {
+               Set<Integer> emptyColumns = new HashSet<Integer>(nCols);
+               for(int i = 0; i < nCols; i++)
+                       emptyColumns.add(i);
+
+               for(AColGroup g : colGroups)
+                       for(int c : g.getColIndices())
+                               emptyColumns.remove(c);
+
+               if(emptyColumns.size() != 0) {
+                       int[] emptyColumnsFinal = 
emptyColumns.stream().mapToInt(Integer::intValue).toArray();
+                       return new ColGroupEmpty(emptyColumnsFinal, 
colGroups.get(0).getNumRows());
+               }
+               else
+                       return null;
+       }
+
        private static class RightMatrixMultTask implements Callable<AColGroup> 
{
                private final AColGroup _colGroup;
                private final MatrixBlock _b;
@@ -163,7 +180,6 @@ public class CLALibRightMultBy {
                                return _colGroup.rightMultByMatrix(_b);
                        }
                        catch(Exception e) {
-                               LOG.error(e);
                                throw new DMLRuntimeException(e);
                        }
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
index 11e0b8b..e3ef846 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
@@ -33,6 +33,7 @@ import org.apache.sysds.hops.OptimizerUtils;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
+import org.apache.sysds.runtime.compress.colgroup.ColGroupCompressed;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupConst;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupOLE;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed;
@@ -180,7 +181,7 @@ public class CLALibScalar {
                                tasks.add(new ScalarTask(uc, sop));
                        }
                        else {
-                               int nv = ((ColGroupValue) grp).getNumValues() * 
grp.getColIndices().length;
+                               int nv = ((ColGroupCompressed) 
grp).getNumValues() * grp.getColIndices().length;
                                if(nv < MINIMUM_PARALLEL_SIZE && !(grp 
instanceof ColGroupOLE)) {
                                        small.add(grp);
                                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixValue.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixValue.java
index c9838d5..98a5536 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixValue.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixValue.java
@@ -97,7 +97,7 @@ public abstract class MatrixValue implements 
WritableComparable
        public abstract void reset(int rl, int cl, double v);
 
        /**
-        * Copy this MatrixValue into that MatrixValue.
+        * Copy that MatrixValue into this MatrixValue.
         * 
         * If the MatrixValue is a MatrixBlock evaluate the sparsity of the 
original matrix,
         * and copy into either a sparse or a dense matrix.
@@ -107,7 +107,7 @@ public abstract class MatrixValue implements 
WritableComparable
        public abstract void copy(MatrixValue that);
 
        /**
-        * Copy this MatrixValue into that MatrixValue. But select sparse 
destination block depending on boolean parameter.
+        * Copy that MatrixValue into this MatrixValue. But select sparse 
destination block depending on boolean parameter.
         * 
         * @param that object to copy the values into.
         * @param sp boolean specifying if output should be forced sparse or 
dense. (only applicable if the 'that' is a MatrixBlock)
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index a93edd2..1167512 100644
--- 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -85,7 +85,7 @@ public abstract class CompressedTestBase extends TestBase {
 
        protected static OverLapping[] overLapping = new OverLapping[] {
                // OverLapping.COL,
-               OverLapping.PLUS, OverLapping.MATRIX, OverLapping.NONE,
+               OverLapping.PLUS, OverLapping.MATRIX, OverLapping.NONE, 
OverLapping.APPEND_EMPTY, OverLapping.APPEND_CONST,
                // OverLapping.MATRIX_PLUS,
                // OverLapping.SQUASH,
                // OverLapping.MATRIX_MULT_NEGATIVE
@@ -217,11 +217,23 @@ public abstract class CompressedTestBase extends TestBase 
{
                                        lossyTolerance = lossyTolerance * 160;
                                        cols = 2;
                                        break;
+                               case APPEND_EMPTY:
+                                       tmp = new MatrixBlock(rows, 1, 0);
+                                       break;
+                               case APPEND_CONST:
+                                       tmp = new MatrixBlock(rows, 1, 0)
+                                               .scalarOperations(new 
RightScalarOperator(Plus.getPlusFnObject(), 1), new MatrixBlock());
+                                       break;
                                default:
                                        break;
                        }
                        if(cmb instanceof CompressedMatrixBlock) {
-                               if(tmp != null) {
+                               if(tmp != null && ov == 
OverLapping.APPEND_EMPTY || ov == OverLapping.APPEND_CONST) {
+                                       mb = mb.append(tmp, new MatrixBlock());
+                                       cmb = cmb.append(tmp, new 
MatrixBlock());
+                                       cols += tmp.getNumColumns();
+                               }
+                               else if(tmp != null) {
                                        // Make Operator
                                        AggregateBinaryOperator abop = 
InstructionUtils.getMatMultOperator(_k);
 
@@ -337,10 +349,10 @@ public abstract class CompressedTestBase extends TestBase 
{
                                return; // Input was not compressed then just 
pass test
 
                        MatrixBlock vector1 = DataConverter
-                               
.convertToMatrixBlock(TestUtils.generateTestMatrix(cols, 1, 0.9, 1.1, 1.0, 3));
+                               
.convertToMatrixBlock(TestUtils.generateTestMatrix(cols, 1, 0.9, 1.5, 1.0, 3));
 
                        MatrixBlock vector2 = (ctype == ChainType.XtwXv) ? 
DataConverter
-                               
.convertToMatrixBlock(TestUtils.generateTestMatrix(rows, 1, 0.9, 1.1, 1.0, 3)) 
: null;
+                               
.convertToMatrixBlock(TestUtils.generateTestMatrix(rows, 1, 0.9, 1.5, 1.0, 3)) 
: null;
 
                        // matrix-vector uncompressed
                        MatrixBlock ret1 = 
mb.chainMatrixMultOperations(vector1, vector2, new MatrixBlock(), ctype, _k);
@@ -681,7 +693,6 @@ public abstract class CompressedTestBase extends TestBase {
                try {
                        if(!(cmb instanceof CompressedMatrixBlock))
                                return; // Input was not compressed then just 
pass test
-                       // ChainType ctype = ChainType.XtwXv;
                        for(MMTSJType mType : new MMTSJType[] {MMTSJType.LEFT,
                                // MMTSJType.RIGHT
                        }) {
@@ -691,8 +702,6 @@ public abstract class CompressedTestBase extends TestBase {
                                // matrix-vector compressed
                                MatrixBlock ret2 = 
cmb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
 
-                               // LOG.error(ret1);
-                               // LOG.error(ret2);
                                // compare result with input
                                compareResultMatrices(ret1, ret2, 100);
                        }
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java 
b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
index 0fcff40..c936c1b 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
@@ -57,7 +57,7 @@ public class TestConstants {
        }
 
        public enum OverLapping {
-               COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, 
PLUS;
+               COL, MATRIX, NONE, MATRIX_PLUS, MATRIX_MULT_NEGATIVE, SQUASH, 
PLUS, APPEND_EMPTY, APPEND_CONST;
 
                public static boolean effectOnOutput(OverLapping opcode) {
                        switch(opcode) {

Reply via email to