This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 0f8e302bcf90434269712a13617728fdaa69f119
Author: baunsgaard <[email protected]>
AuthorDate: Tue May 18 22:35:51 2021 +0200

    [SYSTEMDS-2997] CLA MatrixBlock Dictionary Update
    
    This commit change the dictionary of the column groups to support
    MatrixBlocks, this further enforce the previous design of using already
    implemented kernels, and allow for sparse dictionary exploitation in
    operations.
---
 .../runtime/compress/CompressedMatrixBlock.java    |  39 +-
 .../compress/cocode/CoCodeCostMatrixMult.java      |   2 +-
 .../sysds/runtime/compress/colgroup/AColGroup.java |  64 +---
 .../runtime/compress/colgroup/ColGroupConst.java   |  14 +-
 .../runtime/compress/colgroup/ColGroupDDC.java     |  44 ++-
 .../runtime/compress/colgroup/ColGroupEmpty.java   |   6 +-
 .../runtime/compress/colgroup/ColGroupOLE.java     |  12 +-
 .../runtime/compress/colgroup/ColGroupRLE.java     |  12 +-
 .../runtime/compress/colgroup/ColGroupSDC.java     |  10 +-
 .../compress/colgroup/ColGroupSDCSingle.java       |  71 ++--
 .../compress/colgroup/ColGroupSDCSingleZeros.java  |  30 +-
 .../compress/colgroup/ColGroupSDCZeros.java        |  11 +-
 .../compress/colgroup/ColGroupUncompressed.java    |  17 +-
 .../runtime/compress/colgroup/ColGroupValue.java   | 399 ++++++++++++++-------
 .../compress/colgroup/dictionary/ADictionary.java  |  41 ++-
 .../compress/colgroup/dictionary/Dictionary.java   |  43 ++-
 .../colgroup/dictionary/DictionaryFactory.java     |  34 +-
 .../colgroup/dictionary/MatrixBlockDictionary.java | 195 ++++++++--
 .../compress/colgroup/dictionary/QDictionary.java  |  31 +-
 .../sysds/runtime/compress/lib/BitmapEncoder.java  |   1 +
 .../runtime/compress/lib/CLALibBinaryCellOp.java   |  11 +-
 .../sysds/runtime/compress/lib/CLALibCompAgg.java  |   2 +-
 .../runtime/compress/lib/CLALibLeftMultBy.java     | 358 +++++++++---------
 .../runtime/compress/lib/CLALibRelationalOp.java   |   9 +-
 .../sysds/runtime/compress/lib/CLALibScalar.java   |   1 -
 .../readers/ReaderCompressedSelection.java         |   6 +-
 .../sysds/runtime/compress/utils/DblArray.java     |  11 +-
 .../sysds/runtime/matrix/data/LibMatrixMult.java   |   2 +-
 .../sysds/runtime/matrix/data/MatrixBlock.java     |  10 +
 .../component/compress/CompressedMatrixTest.java   |   7 +-
 .../component/compress/CompressedTestBase.java     |  40 ++-
 .../sysds/test/component/compress/TestBase.java    |  14 +-
 32 files changed, 958 insertions(+), 589 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 249ad3e..27d77ed 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -62,7 +62,6 @@ import org.apache.sysds.runtime.compress.lib.CLALibReExpand;
 import org.apache.sysds.runtime.compress.lib.CLALibRightMultBy;
 import org.apache.sysds.runtime.compress.lib.CLALibScalar;
 import org.apache.sysds.runtime.compress.lib.CLALibSquash;
-import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.controlprogram.caching.CacheBlock;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.Timing;
@@ -198,13 +197,17 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
                Timing time = new Timing(true);
 
+               long nnz = getNonZeros() == -1 ? recomputeNonZeros() : nonZeros;
+               if(isEmpty())
+                       return new MatrixBlock(rlen, clen, true, 0);
+
                // preallocation sparse rows to avoid repeated reallocations
                MatrixBlock ret = 
getUncompressedColGroupAndRemoveFromListOfColGroups();
                if(ret != null && getColGroups().size() == 0)
                        return ret;
                else if(ret == null)
                        ret = new MatrixBlock(rlen, clen, false, -1);
-
+               ret.setNonZeros(nnz);
                ret.allocateDenseBlock();
                decompress(ret);
 
@@ -220,7 +223,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
        private MatrixBlock decompress(MatrixBlock ret) {
 
                for(AColGroup grp : _colGroups)
-                       grp.decompressToBlockUnSafe(ret, 0, rlen, 0, 
grp.getValues());
+                       grp.decompressToBlockUnSafe(ret, 0, rlen, 0);
 
                if(ret.isInSparseFormat())
                        ret.sortSparseRows();
@@ -229,10 +232,6 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        ret.recomputeNonZeros();
                        ret.examSparsity();
                }
-               else if(nonZeros == -1)
-                       ret.setNonZeros(this.recomputeNonZeros());
-               else
-                       ret.setNonZeros(nonZeros);
 
                return ret;
        }
@@ -289,10 +288,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        ret.recomputeNonZeros();
                        ret.examSparsity();
                }
-               else if(nonZeros == -1)
-                       ret.setNonZeros(this.recomputeNonZeros());
-               else
-                       ret.setNonZeros(nonZeros);
+
                return ret;
        }
 
@@ -332,7 +328,6 @@ public class CompressedMatrixBlock extends MatrixBlock {
                        long nnz = 0;
                        for(AColGroup g : _colGroups)
                                nnz += g.getNumberNonZeros();
-
                        nonZeros = nnz;
                }
                return nonZeros;
@@ -673,21 +668,17 @@ public class CompressedMatrixBlock extends MatrixBlock {
                // check for transpose type
                if(tstype != MMTSJType.LEFT) // right not supported yet
                        throw new DMLRuntimeException("Invalid MMTSJ type '" + 
tstype.toString() + "'.");
-
+               if(isEmptyBlock())
+                       return new MatrixBlock(clen, clen, true);
                // create output matrix block
                if(out == null)
                        out = new MatrixBlock(clen, clen, false);
                else
                        out.reset(clen, clen, false);
                out.allocateDenseBlock();
-
-               if(!isEmptyBlock(false)) {
-                       // compute matrix mult
-                       CLALibLeftMultBy.leftMultByTransposeSelf(_colGroups, 
out, k, getNumColumns(), getMaxNumValues(),
-                               isOverlapping());
-                       // post-processing
-                       
out.setNonZeros(LinearAlgebraUtils.copyUpperToLowerTriangle(out));
-               }
+               // compute matrix mult
+               CLALibLeftMultBy.leftMultByTransposeSelf(_colGroups, out, k, 
getNumColumns(), getMaxNumValues(),
+                       isOverlapping());
                return out;
        }
 
@@ -767,7 +758,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
                        // decompress row partition
                        for(AColGroup grp : _colGroups)
-                               grp.decompressToBlock(_ret, _rl, _ru, 
grp.getValues(), false);
+                               grp.decompressToBlock(_ret, _rl, _ru, false);
 
                        // post processing (sort due to append)
                        if(_ret.isInSparseFormat())
@@ -927,7 +918,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
 
        @Override
        public boolean isEmptyBlock(boolean safe) {
-               return(_colGroups == null || getNonZeros() == 0);
+               return _colGroups == null || nonZeros == 0;
        }
 
        @Override
@@ -1002,7 +993,7 @@ public class CompressedMatrixBlock extends MatrixBlock {
                AColGroup grp = _colGroups.get(0);
                MatrixBlock vals = grp.getValuesAsBlock();
                if(grp instanceof ColGroupValue) {
-                       MatrixBlock counts = getCountsAsBlock(((ColGroupValue) 
grp).getCounts());
+                       MatrixBlock counts = getCountsAsBlock( ((ColGroupValue) 
grp).getCounts());
                        if(counts.isEmpty())
                                return vals.cmOperations(op);
                        return vals.cmOperations(op, counts);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
index 836e4d0..0d39b47 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/cocode/CoCodeCostMatrixMult.java
@@ -113,7 +113,7 @@ public class CoCodeCostMatrixMult extends AColumnCoCoder {
                        final double postScalingCost = (nCols > 1 && 
elm.getTupleSparsity() > 0.4) ? numberTuples *
                                nCols : numberTuples * nCols * tupleSparsity;
 
-                       this.cost = preAggregateCost + postScalingCost;
+                       this.cost = preAggregateCost + postScalingCost ;
                }
 
                @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
index 6b74136..c1daeb9 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java
@@ -207,7 +207,7 @@ public abstract class AColGroup implements Serializable {
         * @param ru     row upper
         */
        public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-               decompressToBlock(target, rl, ru, rl, getValues(), true);
+               decompressToBlock(target, rl, ru, rl, true);
        }
 
        /**
@@ -219,23 +219,9 @@ public abstract class AColGroup implements Serializable {
         * @param offT   The rowOffset into target to decompress to.
         */
        public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT) {
-               decompressToBlock(target, rl, ru, offT, getValues(), true);
+               decompressToBlock(target, rl, ru, offT, true);
        }
 
-       /**
-        * Decompress the contents of this column group into the target 
matrixBlock using the values provided as replacement
-        * of the dictionary values, it is assumed that the target matrix Block 
have the same number of columns and at least
-        * the number of rows ru.
-        * 
-        * @param target The target matrixBlock to decompress into
-        * @param rl     The row to start at
-        * @param ru     The row to end at
-        * @param values The dictionary values materialized.
-        * @param safe   Boolean specifying if the operation should be safe, 
aka counting nnz.
-        */
-       public void decompressToBlock(MatrixBlock target, int rl, int ru, 
double[] values, boolean safe) {
-               decompressToBlock(target, rl, ru, rl, values, safe);
-       }
 
        /**
         * Decompress the contents of this column group into the target 
matrixBlock, it is assumed that the target matrix
@@ -247,25 +233,9 @@ public abstract class AColGroup implements Serializable {
         * @param safe   Boolean specifying if the operation should be safe, 
aka counting nnz.
         */
        public void decompressToBlock(MatrixBlock target, int rl, int ru, 
boolean safe) {
-               decompressToBlock(target, rl, ru, rl, getValues(), safe);
+               decompressToBlock(target, rl, ru, rl, safe);
        }
 
-       /**
-        * Decompress the contents of this column group into the target 
matrixBlock with an offset of the indexes, it is
-        * assumed that the target matrix Block have the same number of columns 
and at least the number of rows ru.
-        * 
-        * The offset of indexes makes it possible to decompress parts of the 
compressed column group like say rows 10 to
-        * 20, into row 0 to 10 in the target matrix.
-        * 
-        * @param target The target matrixBlock to decompress into
-        * @param rl     The row to start at
-        * @param ru     The row to end at
-        * @param offT   The offset into the target to decompress to.
-        * @param safe   Boolean specifying if the operation should be safe, 
aka counting nnz.
-        */
-       public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT, boolean safe) {
-               decompressToBlock(target, rl, ru, offT, getValues(), safe);
-       }
 
        /**
         * Decompress the contents of this column group into the target 
matrixBlock with an offset of the indexes using the
@@ -279,14 +249,13 @@ public abstract class AColGroup implements Serializable {
         * @param rl     The row to start at
         * @param ru     The row to end at
         * @param offT   The offset into the target to decompress to.
-        * @param values The dictionary values materialized.
         * @param safe   Boolean specifying if the operation should be safe, 
aka counting nnz.
         */
-       public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT, double[] values, boolean safe) {
+       public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT,  boolean safe) {
                if(safe)
-                       decompressToBlockSafe(target, rl, ru, offT, values);
+                       decompressToBlockSafe(target, rl, ru, offT);
                else
-                       decompressToBlockUnSafe(target, rl, ru, offT, values);
+                       decompressToBlockUnSafe(target, rl, ru, offT);
        }
 
        /**
@@ -297,9 +266,8 @@ public abstract class AColGroup implements Serializable {
         * @param rl     row lower
         * @param ru     row upper
         * @param offT   Offset into target to assign from
-        * @param values The Values materialized in the dictionary
         */
-       public abstract void decompressToBlockSafe(MatrixBlock target, int rl, 
int ru, int offT, double[] values);
+       public abstract void decompressToBlockSafe(MatrixBlock target, int rl, 
int ru, int offT);
 
        /**
         * Decompress the contents of the columngroup unsafely, meaning that it 
does not count nonzero values.
@@ -308,22 +276,8 @@ public abstract class AColGroup implements Serializable {
         * @param rl     row lower
         * @param ru     row upper
         * @param offT   Offset into target to assign from
-        * @param values The Values materialized in the dictionary
-        */
-       public abstract void decompressToBlockUnSafe(MatrixBlock target, int 
rl, int ru, int offT, double[] values);
-
-       /**
-        * Decompress the contents of this column group into the specified full 
matrix block.
-        * 
-        * @param target a matrix block where the columns covered by this 
column group have not yet been filled in.
-        * @param rl     row lower
-        * @param ru     row upper
-        * @param offT   The offset into the target matrix block to decompress 
to.
-        * @param values The Values materialized in the dictionary
         */
-       public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT, double[] values) {
-               decompressToBlockSafe(target, rl, ru, offT, values);
-       }
+       public abstract void decompressToBlockUnSafe(MatrixBlock target, int 
rl, int ru, int offT);
 
        /**
         * Decompress the contents of this column group into uncompressed 
packed columns
@@ -401,7 +355,7 @@ public abstract class AColGroup implements Serializable {
         */
        public static void decompressColumnToBlockUnSafe(MatrixBlock target, 
int rl, int ru, List<AColGroup> colGroups) {
                for(AColGroup g : colGroups)
-                       g.decompressToBlockUnSafe(target, rl, ru, rl, 
g.getValues());
+                       g.decompressToBlockUnSafe(target, rl, ru, rl);
        }
 
        /**
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
index e3c2965..d439c4e 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java
@@ -113,14 +113,15 @@ public class ColGroupConst extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(_colIndexes.length * target.getNumRows() + 
target.getNonZeros());
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                double[] c = target.getDenseBlockValues();
+               double[] values = getValues();
                offT = offT * target.getNumColumns();
                for(int i = rl; i < ru; i++, offT += target.getNumColumns())
                        for(int j = 0; j < _colIndexes.length; j++)
@@ -205,9 +206,10 @@ public class ColGroupConst extends ColGroupValue {
        }
 
        @Override
-       public void leftMultByMatrix(MatrixBlock a, MatrixBlock c, double[] 
values, int rl, int ru) {
+       public void leftMultByMatrix(MatrixBlock a, MatrixBlock c, int rl, int 
ru) {
                final double[] cV = c.getDenseBlockValues();
-               if(a.isEmpty())
+               final double[] values = getValues();
+               if(values == null  || a.isEmpty())
                        return;
                else if(a.isInSparseFormat()) {
                        SparseBlock sb = a.getSparseBlock();
@@ -241,7 +243,7 @@ public class ColGroupConst extends ColGroupValue {
 
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
-               return new ColGroupConst(_colIndexes, _numRows, 
applyBinaryRowOp(op.fn, v, true, left));
+               return new ColGroupConst(_colIndexes, _numRows, 
applyBinaryRowOp(op, v, true, left));
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index fa967da..42cf04b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -27,6 +27,7 @@ import java.util.Arrays;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary;
 import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
+import 
org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
 import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
 import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
 import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
@@ -64,16 +65,50 @@ public class ColGroupDDC extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(target.getNonZeros() + _numRows * 
_colIndexes.length);
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
                final double[] c = target.getDenseBlockValues();
+               if(_dict instanceof MatrixBlockDictionary) {
+                       MatrixBlock dmb = ((MatrixBlockDictionary) 
_dict).getMatrixBlock();
+                       if(dmb.isEmpty())
+                               return;
+                       else if(dmb.isInSparseFormat())
+                               decompressToBlockUnsafeSparse(c, rl, ru, offT, 
dmb.getSparseBlock(), tCol, nCol);
+                       else
+                               decompressToBlockUnsafeDense(c, rl, ru, offT, 
dmb.getDenseBlockValues(), tCol, nCol);
+               }
+               else
+                       decompressToBlockUnsafeDense(c, rl, ru, offT, 
getValues(), tCol, nCol);
+
+       }
+
+       private void decompressToBlockUnsafeSparse(double[] c, int rl, int ru, 
int offT, SparseBlock sb, int tCol,
+               int nCol) {
+               offT = offT * tCol;
+               for(int i = rl; i < ru; i++, offT += tCol) {
+                       final int rowIndex = _data.getIndex(i);
+                       if(sb.isEmpty(rowIndex))
+                               continue;
+                       final int apos = sb.pos(rowIndex);
+                       final int alen = sb.size(rowIndex) + apos;
+                       final double[] avals = sb.values(rowIndex);
+                       final int[] aix = sb.indexes(rowIndex);
+                       for(int j = apos; j < alen; j++)
+                               c[offT + _colIndexes[aix[j]]] += avals[j];
+
+               }
+       }
+
+       private void decompressToBlockUnsafeDense(double[] c, int rl, int ru, 
int offT, double[] values, int tCol,
+               int nCol) {
+               // final double[] values = getValues();
                offT = offT * tCol;
 
                for(int i = rl; i < ru; i++, offT += tCol) {
@@ -81,7 +116,6 @@ public class ColGroupDDC extends ColGroupValue {
                        for(int j = 0; j < nCol; j++)
                                c[offT + _colIndexes[j]] += values[rowIndex + 
j];
                }
-
        }
 
        @Override
@@ -567,7 +601,7 @@ public class ColGroupDDC extends ColGroupValue {
 
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
-               ADictionary aDict = applyBinaryRowOp(op.fn, v, true, left);
+               ADictionary aDict = applyBinaryRowOp(op, v, true, left);
                return new ColGroupDDC(_colIndexes, _numRows, aDict, _data, 
getCachedCounts());
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
index be33491..11adb66 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupEmpty.java
@@ -85,12 +85,12 @@ public class ColGroupEmpty extends ColGroupCompressed {
 
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                // do nothing.
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                // do nothing.
        }
 
@@ -138,7 +138,7 @@ public class ColGroupEmpty extends ColGroupCompressed {
                if(sparseSafe)
                        return this;
                return new ColGroupConst(_colIndexes, _numRows,
-                       new Dictionary(new 
double[_colIndexes.length]).applyBinaryRowOp(op.fn, v, sparseSafe, _colIndexes, 
left));
+                       new Dictionary(new 
double[_colIndexes.length]).applyBinaryRowOp(op, v, sparseSafe, _colIndexes, 
left));
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index bb4f325..712a574 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -70,11 +70,12 @@ public class ColGroupOLE extends ColGroupOffset {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
 
                final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
                final int numCols = getNumCols();
                final int numVals = getNumValues();
+               final double[] values = getValues();
 
                // cache blocking config and position array
                int[] apos = skipScan(numVals, rl);
@@ -112,13 +113,14 @@ public class ColGroupOLE extends ColGroupOffset {
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
 
                final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
                final int numCols = getNumCols();
                final int numVals = getNumValues();
                final int offOut = (rl - offT);
                final int targetCols = target.getNumColumns();
+               final double[] values = getValues();
 
                // cache blocking config and position array
                int[] apos = skipScan(numVals, rl);
@@ -363,7 +365,7 @@ public class ColGroupOLE extends ColGroupOffset {
                // fast path: sparse-safe operations
                // Note that bitmaps don't change and are shallow-copied
                if(sparseSafe) {
-                       return new ColGroupOLE(_colIndexes, _numRows, _zeros, 
applyBinaryRowOp(op.fn, v, sparseSafe, left), _data,
+                       return new ColGroupOLE(_colIndexes, _numRows, _zeros, 
applyBinaryRowOp(op, v, sparseSafe, left), _data,
                                _ptr, getCachedCounts());
                }
 
@@ -372,10 +374,10 @@ public class ColGroupOLE extends ColGroupOffset {
                boolean[] lind = computeZeroIndicatorVector();
                int[] loff = computeOffsets(lind);
                if(loff.length == 0) { // empty offset list: go back to fast 
path
-                       return new ColGroupOLE(_colIndexes, _numRows, false, 
applyBinaryRowOp(op.fn, v, true, left), _data, _ptr,
+                       return new ColGroupOLE(_colIndexes, _numRows, false, 
applyBinaryRowOp(op, v, true, left), _data, _ptr,
                                getCachedCounts());
                }
-               ADictionary rvalues = applyBinaryRowOp(op.fn, v, sparseSafe, 
left);
+               ADictionary rvalues = applyBinaryRowOp(op, v, sparseSafe, left);
                char[] lbitmap = genOffsetBitmap(loff, loff.length);
                char[] rbitmaps = Arrays.copyOf(_data, _data.length + 
lbitmap.length);
                System.arraycopy(lbitmap, 0, rbitmaps, _data.length, 
lbitmap.length);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index 1649591..1c81fca 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -68,10 +68,11 @@ public class ColGroupRLE extends ColGroupOffset {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
                final int numCols = getNumCols();
                final int numVals = getNumValues();
+               final double[] values = getValues();
 
                // position and start offset arrays
                int[] astart = new int[numVals];
@@ -111,10 +112,11 @@ public class ColGroupRLE extends ColGroupOffset {
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
                final int numCols = getNumCols();
                final int numVals = getNumValues();
+               final double[] values = getValues();
 
                // position and start offset arrays
                int[] astart = new int[numVals];
@@ -623,7 +625,7 @@ public class ColGroupRLE extends ColGroupOffset {
                // fast path: sparse-safe operations
                // Note that bitmaps don't change and are shallow-copied
                if(sparseSafe) {
-                       return new ColGroupRLE(_colIndexes, _numRows, _zeros, 
applyBinaryRowOp(op.fn, v, sparseSafe, left), _data,
+                       return new ColGroupRLE(_colIndexes, _numRows, _zeros, 
applyBinaryRowOp(op, v, sparseSafe, left), _data,
                                _ptr, getCachedCounts());
                }
 
@@ -632,11 +634,11 @@ public class ColGroupRLE extends ColGroupOffset {
                boolean[] lind = computeZeroIndicatorVector();
                int[] loff = computeOffsets(lind);
                if(loff.length == 0) { // empty offset list: go back to fast 
path
-                       return new ColGroupRLE(_colIndexes, _numRows, false, 
applyBinaryRowOp(op.fn, v, true, left), _data, _ptr,
+                       return new ColGroupRLE(_colIndexes, _numRows, false, 
applyBinaryRowOp(op, v, true, left), _data, _ptr,
                                getCachedCounts());
                }
 
-               ADictionary rvalues = applyBinaryRowOp(op.fn, v, sparseSafe, 
left);
+               ADictionary rvalues = applyBinaryRowOp(op, v, sparseSafe, left);
                char[] lbitmap = genRLEBitmap(loff, loff.length);
                char[] rbitmaps = Arrays.copyOf(_data, _data.length + 
lbitmap.length);
                System.arraycopy(lbitmap, 0, rbitmaps, _data.length, 
lbitmap.length);
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
index d2cebf5..769b2fe 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDC.java
@@ -97,17 +97,19 @@ public class ColGroupSDC extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(getNumberNonZeros());
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
 
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
+               final double[] values = getValues();
                final int offsetToDefault = values.length - nCol;
+
                double[] c = target.getDenseBlockValues();
                offT = offT * tCol;
                int i = rl;
@@ -343,7 +345,7 @@ public class ColGroupSDC extends ColGroupValue {
 
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
-               return new ColGroupSDC(_colIndexes, _numRows, 
applyBinaryRowOp(op.fn, v, true, left), _indexes, _data,
+               return new ColGroupSDC(_colIndexes, _numRows, 
applyBinaryRowOp(op, v, true, left), _indexes, _data,
                        getCachedCounts());
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
index 1424072..493afac 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingle.java
@@ -86,17 +86,19 @@ public class ColGroupSDCSingle extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(_numRows * _colIndexes.length + 
target.getNonZeros());
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
 
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
+               final double[] values = getValues();
                final int offsetToDefault = values.length - nCol;
+
                double[] c = target.getDenseBlockValues();
                offT = offT * tCol;
                int i = rl;
@@ -105,17 +107,17 @@ public class ColGroupSDCSingle extends ColGroupValue {
                for(; i < ru && it.hasNext(); i++, offT += tCol) {
                        if(it.value() == i) {
                                for(int j = 0; j < nCol; j++)
-                                       c[offT + _colIndexes[j]] += 
values[offsetToDefault + j];
+                                       c[offT + _colIndexes[j]] += values[j];
                                it.next();
                        }
                        else
                                for(int j = 0; j < nCol; j++)
-                                       c[offT + _colIndexes[j]] += values[j];
+                                       c[offT + _colIndexes[j]] += 
values[offsetToDefault + j];
                }
 
                for(; i < ru; i++, offT += tCol)
                        for(int j = 0; j < nCol; j++)
-                               c[offT + _colIndexes[j]] += values[j];
+                               c[offT + _colIndexes[j]] += 
values[offsetToDefault + j];
        }
 
        @Override
@@ -166,14 +168,14 @@ public class ColGroupSDCSingle extends ColGroupValue {
                for(; i < ru && it.hasNext(); i++, offT++) {
                        if(it.value() == i) {
                                it.next();
-                               c[offT] += values[offsetToDefault + colpos];
+                               c[offT] += values[colpos];
                        }
                        else
-                               c[offT] += values[colpos];
+                               c[offT] += values[offsetToDefault + colpos];
                }
 
                for(; i < ru; i++, offT++)
-                       c[offT] += values[colpos];
+                       c[offT] += values[offsetToDefault + colpos];
        }
 
        @Override
@@ -186,9 +188,9 @@ public class ColGroupSDCSingle extends ColGroupValue {
                AIterator it = _indexes.getIterator();
                it.skipTo(r);
                if(it.value() == r)
-                       return _dict.getValue(_colIndexes.length + ix);
+                       return _dict.getValue(ix + c);
                else
-                       return _dict.getValue(ix);
+                       return _dict.getValue(_colIndexes.length + c);
 
        }
 
@@ -208,14 +210,14 @@ public class ColGroupSDCSingle extends ColGroupValue {
                it.skipTo(rl);
                for(; rix < ru && it.hasNext(); rix++) {
                        if(it.value() != rix)
-                               c[rix] += vals[0];
-                       else {
                                c[rix] += vals[1];
+                       else {
+                               c[rix] += vals[0];
                                it.next();
                        }
                }
                for(; rix < ru; rix++) {
-                       c[rix] += vals[0];
+                       c[rix] += vals[1];
                }
        }
 
@@ -230,14 +232,14 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
                for(; rix < ru && it.hasNext(); rix++) {
                        if(it.value() != rix)
-                               c[rix] = builtin.execute(c[rix], vals[0]);
-                       else {
                                c[rix] = builtin.execute(c[rix], vals[1]);
+                       else {
+                               c[rix] = builtin.execute(c[rix], vals[0]);
                                it.next();
                        }
                }
                for(; rix < ru; rix++) {
-                       c[rix] = builtin.execute(c[rix], vals[0]);
+                       c[rix] = builtin.execute(c[rix], vals[1]);
                }
        }
 
@@ -273,21 +275,25 @@ public class ColGroupSDCSingle extends ColGroupValue {
                if(row > 0) {
                        int offA = _numRows * row;
                        for(; i < _numRows && it.hasNext(); i++, offA++)
-                               if(it.value() == i)
-                                       vals[1] += a[offA];
-                               else
+                               if(it.value() == i){
+                                       it.next();
                                        vals[0] += a[offA];
+                               }
+                               else
+                                       vals[1] += a[offA];
                        for(; i < _numRows; i++, offA++)
-                               vals[0] += a[offA];
+                               vals[1] += a[offA];
                }
                else {
                        for(; i < _numRows && it.hasNext(); i++)
-                               if(it.value() == i)
-                                       vals[1] += a[i];
-                               else
+                               if(it.value() == i){
+                                       it.next();
                                        vals[0] += a[i];
+                               }
+                               else
+                                       vals[1] += a[i];
                        for(; i < _numRows; i++)
-                               vals[0] += a[i];
+                               vals[1] += a[i];
                }
 
                return vals;
@@ -326,7 +332,7 @@ public class ColGroupSDCSingle extends ColGroupValue {
 
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
-               return new ColGroupSDCSingle(_colIndexes, _numRows, 
applyBinaryRowOp(op.fn, v, true, left), _indexes,
+               return new ColGroupSDCSingle(_colIndexes, _numRows, 
applyBinaryRowOp(op, v, true, left), _indexes,
                        getCachedCounts());
        }
 
@@ -382,11 +388,12 @@ public class ColGroupSDCSingle extends ColGroupValue {
                for(; i < this._numRows && it.hasNext(); i++) {
                        int col = lhs._data.getIndex(i);
                        if(it.value() == i) {
-                               row = 1;
+                               row = 0;
                                it.next();
                        }
                        else
-                               row = 0;
+                               row = 1;
+
                        if(col < lhs.getNumValues())
                                ag.increment(col + row * nCol);
                }
@@ -420,11 +427,11 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        else
                                col = defL;
                        if(rIt.value() == i) {
-                               row = 1;
+                               row = 0;
                                rIt.next();
                        }
                        else
-                               row = 0;
+                               row = 1;
                        ag.increment(col + row * nCol);
                }
 
@@ -444,11 +451,11 @@ public class ColGroupSDCSingle extends ColGroupValue {
                        col = defL;
                        for(; i < this._numRows && rIt.hasNext(); i++) {
                                if(rIt.value() == i) {
-                                       row = 1;
+                                       row = 0;
                                        rIt.next();
                                }
                                else
-                                       row = 0;
+                                       row = 1;
                                ag.increment(col + row * nCol);
                        }
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
index 997c42a..7e68e17 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java
@@ -88,15 +88,16 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(_indexes.getSize() * _colIndexes.length + 
target.getNonZeros());
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
+               final double[] values = getValues();
                final int offTCorrected = offT - rl;
                final double[] c = target.getDenseBlockValues();
 
@@ -108,6 +109,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
                        for(int j = 0; j < nCol; j++) {
                                c[rc + _colIndexes[j]] += values[j];
                        }
+                       it.next();
                }
        }
 
@@ -280,7 +282,7 @@ public class ColGroupSDCSingleZeros extends ColGroupValue {
                if(isSparseSafeOp)
                        return new ColGroupSDCSingleZeros(_colIndexes, 
_numRows, applyScalarOp(op), _indexes, getCachedCounts());
                else {
-                       ADictionary aDictionary = swapEntries(applyScalarOp(op, 
val0, getNumCols()));
+                       ADictionary aDictionary = applyScalarOp(op, val0, 
getNumCols());// swapEntries();
                        // ADictionary aDictionary = applyScalarOp(op, val0, 
getNumCols());
                        return new ColGroupSDCSingle(_colIndexes, _numRows, 
aDictionary, _indexes, null);
                }
@@ -289,22 +291,22 @@ public class ColGroupSDCSingleZeros extends ColGroupValue 
{
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
                if(sparseSafe)
-                       return new ColGroupSDCSingleZeros(_colIndexes, 
_numRows, applyBinaryRowOp(op.fn, v, sparseSafe, left),
+                       return new ColGroupSDCSingleZeros(_colIndexes, 
_numRows, applyBinaryRowOp(op, v, sparseSafe, left),
                                _indexes, getCachedCounts());
                else {
-                       ADictionary aDictionary = applyBinaryRowOp(op.fn, v, 
sparseSafe, left);
+                       ADictionary aDictionary = applyBinaryRowOp(op, v, 
sparseSafe, left);
                        return new ColGroupSDCSingle(_colIndexes, _numRows, 
aDictionary, _indexes, getCachedCounts());
                }
        }
 
-       private ADictionary swapEntries(ADictionary aDictionary) {
-               double[] values = aDictionary.getValues().clone();
-               double[] swap = new double[_colIndexes.length];
-               System.arraycopy(values, 0, swap, 0, _colIndexes.length);
-               System.arraycopy(values, _colIndexes.length, values, 0, 
_colIndexes.length);
-               System.arraycopy(swap, 0, values, _colIndexes.length, 
_colIndexes.length);
-               return new Dictionary(values);
-       }
+       // private ADictionary swapEntries(ADictionary aDictionary) {
+       //      double[] values = aDictionary.getValues().clone();
+       //      double[] swap = new double[_colIndexes.length];
+       //      System.arraycopy(values, 0, swap, 0, _colIndexes.length);
+       //      System.arraycopy(values, _colIndexes.length, values, 0, 
_colIndexes.length);
+       //      System.arraycopy(swap, 0, values, _colIndexes.length, 
_colIndexes.length);
+       //      return new Dictionary(values);
+       // }
 
        @Override
        public void write(DataOutput out) throws IOException {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
index 77cf6e2..f04f1a3 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java
@@ -105,15 +105,16 @@ public class ColGroupSDCZeros extends ColGroupValue {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
-               decompressToBlockUnSafe(target, rl, ru, offT, values);
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
+               decompressToBlockUnSafe(target, rl, ru, offT);
                target.setNonZeros(getNumberNonZeros());
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
+               final double[] values = getValues();
                final int offTCorrected = offT - rl;
                final double[] c = target.getDenseBlockValues();
 
@@ -306,10 +307,10 @@ public class ColGroupSDCZeros extends ColGroupValue {
        @Override
        public AColGroup binaryRowOp(BinaryOperator op, double[] v, boolean 
sparseSafe, boolean left) {
                if(sparseSafe)
-                       return new ColGroupSDCZeros(_colIndexes, _numRows, 
applyBinaryRowOp(op.fn, v, sparseSafe, left), _indexes,
+                       return new ColGroupSDCZeros(_colIndexes, _numRows, 
applyBinaryRowOp(op, v, sparseSafe, left), _indexes,
                                _data, getCachedCounts());
                else
-                       return new ColGroupSDC(_colIndexes, _numRows, 
applyBinaryRowOp(op.fn, v, sparseSafe, left), _indexes, _data,
+                       return new ColGroupSDC(_colIndexes, _numRows, 
applyBinaryRowOp(op, v, sparseSafe, left), _indexes, _data,
                                getCachedCounts());
        }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index c8f9bf6..562e4d8 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -157,7 +157,7 @@ public class ColGroupUncompressed extends AColGroup {
        }
 
        @Override
-       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                double[] c = target.getDenseBlockValues();
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
@@ -180,7 +180,7 @@ public class ColGroupUncompressed extends AColGroup {
                        }
                }
                else {
-                       values = _data.getDenseBlockValues();
+                       double[] values = _data.getDenseBlockValues();
                        offT = offT * tCol;
                        int offS = rl * nCol;
                        for(int row = rl; row < ru; row++, offT += tCol, offS 
+= nCol) {
@@ -197,7 +197,7 @@ public class ColGroupUncompressed extends AColGroup {
        }
 
        @Override
-       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT, double[] values) {
+       public void decompressToBlockUnSafe(MatrixBlock target, int rl, int ru, 
int offT) {
                double[] c = target.getDenseBlockValues();
                final int nCol = _colIndexes.length;
                final int tCol = target.getNumColumns();
@@ -218,7 +218,7 @@ public class ColGroupUncompressed extends AColGroup {
                        }
                }
                else {
-                       values = _data.getDenseBlockValues();
+                       double[] values = _data.getDenseBlockValues();
                        offT = offT * tCol;
                        int offS = rl * nCol;
                        for(int row = rl; row < ru; row++, offT += tCol, offS 
+= nCol)
@@ -521,7 +521,8 @@ public class ColGroupUncompressed extends AColGroup {
 
        @Override
        public void tsmm(double[] result, int numColumns) {
-               MatrixBlock tmp = new MatrixBlock(_colIndexes.length, 
_colIndexes.length, true);
+               final int tCol = _colIndexes.length;
+               MatrixBlock tmp = new MatrixBlock(tCol, tCol, true);
                LibMatrixMult.matrixMultTransposeSelf(_data, tmp, true, false);
                if(tmp.getDenseBlock() == null && tmp.getSparseBlock() == null)
                        return;
@@ -530,9 +531,9 @@ public class ColGroupUncompressed extends AColGroup {
                }
                else {
                        double[] tmpV = tmp.getDenseBlockValues();
-                       for(int i = 0, offD = 0, offT = 0; i < numColumns; i++, 
offD += numColumns, offT += _colIndexes.length)
-                               for(int j = i; j < numColumns; j++)
-                                       result[offD + _colIndexes[j]] += 
tmpV[offT + j];
+                       for(int row = 0, offRet = 0, offTmp = 0; row < tCol; 
row++, offRet += numColumns, offTmp += tCol)
+                               for(int col = row; col < tCol; col++)
+                                       result[offRet + _colIndexes[col]] += 
tmpV[offTmp + col];
                }
 
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index a0c127a..42bf407 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -41,10 +41,10 @@ import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.functionobjects.ValueFunction;
 import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
@@ -90,11 +90,6 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
        }
 
        @Override
-       public void decompressToBlock(MatrixBlock target, int rl, int ru, int 
offT) {
-               decompressToBlock(target, rl, ru, offT, getValues());
-       }
-
-       @Override
        public final int getNumValues() {
                return _dict.getNumberOfValues(_colIndexes.length);
        }
@@ -119,12 +114,13 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
 
        @Override
        public MatrixBlock getValuesAsBlock() {
-               final double[] values = getValues();
-               int vlen = values.length;
-               int rlen = _zeros ? vlen + 1 : vlen;
-               MatrixBlock ret = new MatrixBlock(rlen, 1, false);
-               for(int i = 0; i < vlen; i++)
-                       ret.quickSetValue(i, 0, values[i]);
+               _dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
+               MatrixBlock ret = ((MatrixBlockDictionary) 
_dict).getMatrixBlock();
+               if(_zeros) {
+                       MatrixBlock tmp = new MatrixBlock();
+                       ret.append(new MatrixBlock(1, _colIndexes.length, 0), 
tmp, false);
+                       return tmp;
+               }
                return ret;
        }
 
@@ -379,9 +375,9 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
         * @param left       Specify which side the operation is executed on.
         * @return The new Dictionary with values.
         */
-       public ADictionary applyBinaryRowOp(ValueFunction fn, double[] v, 
boolean sparseSafe, boolean left) {
-               return sparseSafe ? _dict.clone().applyBinaryRowOp(fn, v, 
sparseSafe, _colIndexes, left) : _dict
-                       .applyBinaryRowOp(fn, v, sparseSafe, _colIndexes, left);
+       public ADictionary applyBinaryRowOp(BinaryOperator op, double[] v, 
boolean sparseSafe, boolean left) {
+               return sparseSafe ? _dict.clone().applyBinaryRowOp(op, v, 
sparseSafe, _colIndexes, left) : _dict
+                       .applyBinaryRowOp(op, v, sparseSafe, _colIndexes, left);
        }
 
        protected void setandExecute(double[] c, boolean square, double val, 
int rix) {
@@ -446,7 +442,7 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                sb.append(" Is Lossy: " + _dict.isLossy() + " num Rows: " + 
getNumRows() + " contain zero row:" + _zeros);
                sb.append(super.toString());
                if(_dict != null) {
-                       sb.append(String.format("\n%15s%5d ", "Values:", 
_dict.getValues().length));
+                       sb.append(String.format("\n%15s ", "Values: " + 
_dict.getClass().getSimpleName()));
                        sb.append(_dict.getString(_colIndexes.length));
                }
                return sb.toString();
@@ -468,9 +464,8 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
        @Override
        public void write(DataOutput out) throws IOException {
                super.write(out);
-
                out.writeBoolean(_zeros);
-               out.writeBoolean(_dict.isLossy());
+
                _dict.write(out);
 
        }
@@ -802,8 +797,8 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
        private void leftMultByColGroupValue(ColGroupValue lhs, MatrixBlock 
result) {
                final int nvL = lhs.getNumValues();
                final int nvR = this.getNumValues();
-               final double[] lhValues = lhs.getValues();
-               final double[] rhValues = this.getValues();
+               // final double[] lhValues = lhs.getValues();
+               // final double[] rhValues = this.getValues();
                final int lCol = lhs._colIndexes.length;
                final int rCol = this._colIndexes.length;
                final double[] resV = result.getDenseBlockValues();
@@ -812,25 +807,20 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                final double threshold = 0.2;
 
                if(sameIndexStructure(lhs)) {
-                       int[] agI = getCounts();
-                       for(int i = 0; i < agI.length; i++) {
-                               if(i < nvL)
-                                       for(int l = 0; l < lCol; l++) {
-                                               final int leftOff = 
lhs._colIndexes[l] * numCols;
-                                               final double lhV = lhValues[i * 
lCol + l] * agI[i];
-                                               if(lhV != 0)
-                                                       for(int r = 0; r < 
rCol; r++) {
-                                                               final double 
rhV = rhValues[i * rCol + r];
-                                                               final double va 
= lhV * rhV;
-                                                               resV[leftOff + 
this._colIndexes[r]] += va;
-                                                       }
-                                       }
+                       if(this._dict == lhs._dict) {
+                               tsmmDictionaryWithScaling(_dict, getCounts(), 
lhs._colIndexes, this._colIndexes, resV, numCols);
                        }
+                       else
+                               
matrixMultDictionariesAndOutputToColIndexesWithScaling(lhs._dict, this._dict, 
lhs._colIndexes,
+                                       this._colIndexes, resV, numCols, 
getCounts());
+
                }
                else if(lhs instanceof ColGroupConst || this instanceof 
ColGroupConst) {
-                       double[] r = this instanceof ColGroupConst ? rhValues : 
this._dict.colSum(getCounts(), rCol);
-                       double[] l = lhs instanceof ColGroupConst ? lhValues : 
lhs._dict.colSum(lhs.getCounts(), lCol);
-                       vectorVectorMultiply(l, lhs._colIndexes, r, 
this._colIndexes, resV, numCols);
+                       ADictionary r = this instanceof ColGroupConst ? 
this._dict : new Dictionary(
+                               this._dict.colSum(getCounts(), rCol));
+                       ADictionary l = lhs instanceof ColGroupConst ? 
lhs._dict : new Dictionary(
+                               lhs._dict.colSum(lhs.getCounts(), lCol));
+                       matrixMultDictionariesAndOutputToColIndexes(l, r, 
lhs._colIndexes, this._colIndexes, resV, numCols);
                }
                else {
                        int[] countsRight = getCounts();
@@ -846,53 +836,90 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                                double[] mct = 
this._dict.getMostCommonTuple(this.getCounts(), rCol);
                                double[] lhsSum = 
lhs._dict.colSum(lhs.getCounts(), lCol);
                                if(mct != null)
-                                       vectorVectorMultiply(lhsSum, 
lhs._colIndexes, mct, this._colIndexes, resV, numCols);
+                                       outerProduct(lhsSum, lhs._colIndexes, 
mct, this._colIndexes, resV, numCols);
 
                                ColGroupValue thisM = (mct != null) ? 
(ColGroupValue) this
                                        
.copyAndSet(this._dict.subtractTuple(mct)) : this;
                                Dictionary preAgg = 
lhs.preAggregateThatIndexStructure(thisM, true);
-                               
matrixMultDictionariesAndOutputToColIndexes(lhValues, preAgg.getValues(), 
lhs._colIndexes,
-                                       this._colIndexes, resV, numCols);
+                               
matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, 
this._colIndexes, resV,
+                                       numCols);
                        }
                        else if(skipLeft > threshold && !(lhs instanceof 
ColGroupDDC)) {
                                double[] mct = 
lhs._dict.getMostCommonTuple(lhs.getCounts(), lCol);
                                double[] thisColSum = 
this._dict.colSum(getCounts(), rCol);
                                if(mct != null)
-                                       vectorVectorMultiply(mct, 
lhs._colIndexes, thisColSum, this._colIndexes, resV, numCols);
+                                       outerProduct(mct, lhs._colIndexes, 
thisColSum, this._colIndexes, resV, numCols);
 
                                ColGroupValue lhsM = (mct != null) ? 
(ColGroupValue) lhs.copyAndSet(lhs._dict.subtractTuple(mct)) : lhs;
                                Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhsM, true);
-                               
matrixMultDictionariesAndOutputToColIndexes(preAgg.getValues(), rhValues, 
lhs._colIndexes,
-                                       this._colIndexes, resV, numCols);
+                               
matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, 
lhs._colIndexes, this._colIndexes, resV,
+                                       numCols);
                        }
                        else if(nvR * rCol < nvL * lCol) {
                                Dictionary preAgg = 
lhs.preAggregateThatIndexStructure(this, false);
-                               
matrixMultDictionariesAndOutputToColIndexes(lhValues, preAgg.getValues(), 
lhs._colIndexes,
-                                       this._colIndexes, resV, numCols);
+                               
matrixMultDictionariesAndOutputToColIndexes(lhs._dict, preAgg, lhs._colIndexes, 
this._colIndexes, resV,
+                                       numCols);
                        }
                        else {
                                Dictionary preAgg = 
this.preAggregateThatIndexStructure(lhs, false);
-                               
matrixMultDictionariesAndOutputToColIndexes(preAgg.getValues(), rhValues, 
lhs._colIndexes,
-                                       this._colIndexes, resV, numCols);
+                               
matrixMultDictionariesAndOutputToColIndexes(preAgg, this._dict, 
lhs._colIndexes, this._colIndexes, resV,
+                                       numCols);
                        }
                }
        }
 
        @Override
        public void tsmm(double[] result, int numColumns) {
-               int[] counts = getCounts();
-               double[] values = getValues();
-               int[] columns = getColIndices();
+
+               // final int[] counts = getCounts();
+
+               // _dict = _dict.getAsMatrixBlockDictionary(_colIndexes.length);
+               // if(_dict instanceof MatrixBlockDictionary) {
+               //      MatrixBlockDictionary mbd = (MatrixBlockDictionary) 
_dict;
+               //      MatrixBlock mb = mbd.getMatrixBlock();
+               //      if(mb.isEmpty())
+               //              return;
+               //      else if(mb.isInSparseFormat())
+               //              tsmmSparse(result, numColumns, 
mb.getSparseBlock(), counts);
+               //      else
+               //              tsmmDense(result, numColumns, 
mb.getDenseBlockValues(), counts);
+               // }
+               // else
+               //      tsmmDense(result, numColumns, getValues(), counts);
+
+       }
+
+       private void tsmmDense(double[] result, int numColumns, double[] 
values, int[] counts) {
                if(values == null)
                        return;
-               for(int i = 0; i < columns.length; i++) {
-                       final int y = columns[i] * numColumns;
-                       for(int j = i; j < columns.length; j++) {
-                               final int x = columns[j];
-                               for(int h = 0; h < values.length / 
columns.length; h++) {
-                                       double a = values[h * columns.length + 
i];
-                                       double b = values[h * columns.length + 
j];
-                                       result[x + y] += a * b * counts[h];
+               final int nCol = _colIndexes.length;
+               final int nRow = values.length / _colIndexes.length;
+               for(int k = 0; k < nRow; k++) {
+                       final int offTmp = nCol * k;
+                       final int scale = counts[k];
+                       for(int i = 0; i < nCol; i++) {
+                               final int offRet = numColumns * _colIndexes[i];
+                               final double v = values[offTmp + i] * scale;
+                               if(v != 0)
+                                       for(int j = i; j < nCol; j++)
+                                               result[offRet + _colIndexes[j]] 
+= v * values[offTmp + j];
+                       }
+               }
+       }
+
+       private void tsmmSparse(double[] result, int numColumns, SparseBlock 
sb, int[] counts) {
+               for(int row = 0; row < sb.numRows(); row++) {
+                       if(sb.isEmpty(row))
+                               continue;
+                       final int apos = sb.pos(row);
+                       final int alen = sb.size(row);
+                       final int[] aix = sb.indexes(row);
+                       final double[] avals = sb.values(row);
+                       for(int i = apos; i < apos + alen; i++) {
+                               final int offRet = _colIndexes[aix[i]] * 
numColumns;
+                               final double val = avals[i] * counts[row];
+                               for(int j = i; j < apos + alen; j++) {
+                                       result[offRet + _colIndexes[aix[j]]] += 
val * avals[j];
                                }
                        }
                }
@@ -909,19 +936,71 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                return _dict.getNumberNonZeros(counts, _colIndexes.length);
        }
 
-       private static void vectorVectorMultiply(final double[] left, final 
int[] leftRows, final double[] right,
+       private static void 
matrixMultDictionariesAndOutputToColIndexesWithScaling(final ADictionary left,
+               final ADictionary right, final int[] leftRows, final int[] 
rightColumns, final double[] result,
+               final int outCols, final int[] counts) {
+               final boolean modifyRight = right.getInMemorySize() > 
left.getInMemorySize();
+               ADictionary rightM = modifyRight ? right.scaleTuples(counts, 
rightColumns.length) : right;
+               ADictionary leftM = modifyRight ? left : 
left.scaleTuples(counts, leftRows.length);
+
+               matrixMultDictionariesAndOutputToColIndexes(leftM, rightM, 
leftRows, rightColumns, result, outCols);
+
+       }
+
+       private static void tsmmDictionaryWithScaling(final ADictionary dict, 
final int[] counts, final int[] rows,
+               final int[] cols, final double[] res, final int outCols) {
+
+               if(dict instanceof MatrixBlockDictionary) {
+                       MatrixBlockDictionary mbd = (MatrixBlockDictionary) 
dict;
+                       MatrixBlock mb = mbd.getMatrixBlock();
+                       if(mb.isEmpty())
+                               return;
+                       else if(mb.isInSparseFormat()) {
+                               SparseBlock sb = mb.getSparseBlock();
+                               for(int row = 0; row < sb.numRows(); row++) {
+                                       if(sb.isEmpty(row))
+                                               continue;
+                                       final int apos = sb.pos(row);
+                                       final int alen = sb.size(row);
+                                       final int[] aix = sb.indexes(row);
+                                       final double[] avals = sb.values(row);
+                                       for(int i = apos; i < apos + alen; i++) 
{
+                                               final int offRet = rows[aix[i]] 
* outCols;
+                                               final double val = avals[i] * 
counts[row];
+                                               for(int j = i; j < apos + alen; 
j++) {
+                                                       res[offRet + 
cols[aix[j]]] += val * avals[j];
+                                               }
+                                       }
+                               }
+                       }
+                       else {
+                               throw new NotImplementedException();
+                       }
+               }
+               else {
+                       double[] values = dict.getValues();
+                       for(int row = 0; row < rows.length; row++) {
+                               final int offTmp = cols.length * row;
+                               final int offRet = outCols * rows[row];
+                               for(int col = 0; col < cols.length; col++) {
+                                       final double v = values[offTmp + col] * 
counts[row];
+                                       if(v != 0)
+                                               for(int j = col; j < 
cols.length; j++)
+                                                       res[offRet + cols[col]] 
+= v * values[offTmp + j];
+                               }
+                       }
+               }
+       }
+
+       private static void outerProduct(final double[] left, final int[] 
leftRows, final double[] right,
                final int[] rightColumns, final double[] result, final int 
outCols) {
-               if(left.length != leftRows.length) {
-                       // LOG.error(Arrays.toString(left));
-                       // LOG.error(Arrays.toString(right));
-                       // LOG.error(Arrays.toString(leftRows));
-                       // LOG.error(Arrays.toString(rightColumns));
+               if(left.length != leftRows.length)
                        throw new DMLCompressionException(
                                "Error left length " + left.length + " not 
equal columns length" + leftRows.length);
-               }
+
                if(right.length != rightColumns.length)
                        throw new DMLCompressionException(
-                               "Error right not equal length " + right.length 
+ "  " + rightColumns.length);
+                               "Error right not equal length " + right.length 
+ " " + rightColumns.length);
                for(int row = 0; row < leftRows.length; row++) {
                        final int outputRowOffset = leftRows[row] * outCols;
                        final double vLeft = left[row];
@@ -937,64 +1016,137 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
         * 
         * making the multiplication a: t(left) %*% right
         * 
-        * @param left      The left side matrix, transposed linearized row 
major
-        * @param right     The right hand side linearized row major
+        * @param left      The left side dictionary
+        * @param right     The right side dictionary
         * @param rowsLeft  The number of rows and the row indexes on the left 
hand side
         * @param colsRight The number of columns and the column indexes on the 
right hand side
         * @param result    The result matrix to put the results into, 
linearized row major
         * @param outCols   The output columns count, to know how much to 
offset into with results.
         */
-       private static void 
matrixMultDictionariesAndOutputToColIndexes(double[] left, double[] right, 
int[] rowsLeft,
+       private static void 
matrixMultDictionariesAndOutputToColIndexes(ADictionary left, ADictionary 
right, int[] rowsLeft,
                int[] colsRight, double[] result, int outCols) {
 
                try {
-                       final int rows = left.length / rowsLeft.length;
-                       if(rows != right.length / colsRight.length)
-                               throw new DMLCompressionException(
-                                       "Not equal number of rows: " + rows + " 
" + right.length / colsRight.length);
-                       for(int k = 0; k < rows; k++) {
-                               final int offL = k * rowsLeft.length;
-                               final int offR = k * colsRight.length;
-                               // final int offL = k * colsRight.length;
-                               // final int offR = k * rowsLeft.length;
-                               // if(offR < right.length && offL < left.length)
-                               for(int i = 0; i < rowsLeft.length; i++) {
-                                       final int offOut = rowsLeft[i] * 
outCols;
-                                       final double vl = left[offL + i];
-                                       if(vl != 0)
-                                               for(int j = 0; j < 
colsRight.length; j++) {
-                                                       final double vr = 
right[offR + j];
-                                                       result[offOut + 
colsRight[j]] += vl * vr;
-                                               }
+                       double[] leftV = null;
+                       double[] rightV = null;
+
+                       if(left instanceof MatrixBlockDictionary) {
+                               MatrixBlockDictionary leftD = 
left.getAsMatrixBlockDictionary(rowsLeft.length);
+                               MatrixBlock leftMB = leftD.getMatrixBlock();
+                               if(leftMB.isEmpty())
+                                       return;
+                               else if(right instanceof MatrixBlockDictionary) 
{
+                                       MatrixBlockDictionary rightD = 
right.getAsMatrixBlockDictionary(colsRight.length);
+                                       MatrixBlock rightMB = 
rightD.getMatrixBlock();
+                                       if(rightMB.isEmpty())
+                                               return;
+                                       else if(rightMB.isInSparseFormat() && 
leftMB.isInSparseFormat()) {
+                                               throw new 
NotImplementedException("Not Supported sparse sparse dictionary 
multiplication");
+                                       }
+                               }
+                               else if(leftMB.isInSparseFormat()) {
+                                       
matrixMultDictionariesAndOutputToColIndecesSparseDense(leftMB.getSparseBlock(), 
right.getValues(),
+                                               rowsLeft, colsRight, result, 
outCols);
+                                       return;
                                }
                        }
+                       else {
+                               leftV = left.getValues();
+                       }
+
+                       if(right instanceof MatrixBlockDictionary) {
+                               MatrixBlockDictionary rightD = 
right.getAsMatrixBlockDictionary(colsRight.length);
+                               MatrixBlock rightMB = rightD.getMatrixBlock();
+
+                               if(rightMB.isEmpty())
+                                       return;
+                               else if(rightMB.isInSparseFormat()) {
+                                       
matrixMultDictionariesAndOutputToColIndecesDenseSparse(leftV, 
rightMB.getSparseBlock(), rowsLeft,
+                                               colsRight, result, outCols);
+                                       return;
+                               }
+                       }
+                       else {
+                               rightV = right.getValues();
+                       }
+
+                       if(leftV != null && rightV != null) {
+                               // default if there was not sparse found;
+                               LOG.warn("Inefficient forced dense values");
+                               
matrixMultDictionariesAndOutputToColIndexesDenseDense(leftV, rightV, rowsLeft, 
colsRight, result,
+                                       outCols);
+                       }
+
                }
                catch(Exception e) {
-
                        if(logMM) {
-                               StringBuilder sb = new StringBuilder();
-                               sb.append("\nLeft (transposed):\n");
-                               for(int i = 0; i < rowsLeft.length; i++) {
-                                       for(int j = i * rowsLeft.length; j < (i 
+ 1) * rowsLeft.length; j++)
-                                               sb.append(left[j] + ", ");
-                                       sb.append("\n");
-                               }
-                               LOG.error(sb);
-
-                               sb = new StringBuilder();
-                               sb.append("\nRight:\n");
-                               for(int i = 0; i < colsRight.length; i++) {
-                                       for(int j = i * colsRight.length; j < 
(i + 1) * colsRight.length; j++)
-                                               sb.append(right[j] + ", ");
-                                       sb.append("\n");
-                               }
-                               LOG.error(sb);
+                               LOG.error("\nLeft (transposed):\n" + left + 
"\nRight:\n" + right);
                                logMM = false;
                        }
                        throw new DMLCompressionException("MM of pre aggregated 
colGroups failed", e);
                }
        }
 
+       private static void 
matrixMultDictionariesAndOutputToColIndexesDenseDense(double[] left, double[] 
right,
+               int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+               final int commonDim = Math.min(left.length / rowsLeft.length, 
right.length / colsRight.length);
+               for(int k = 0; k < commonDim; k++) {
+                       final int offL = k * rowsLeft.length;
+                       final int offR = k * colsRight.length;
+                       for(int i = 0; i < rowsLeft.length; i++) {
+                               final int offOut = rowsLeft[i] * outCols;
+                               final double vl = left[offL + i];
+                               if(vl != 0)
+                                       for(int j = 0; j < colsRight.length; 
j++) {
+                                               final double vr = right[offR + 
j];
+                                               result[offOut + colsRight[j]] 
+= vl * vr;
+                                       }
+                       }
+               }
+       }
+
+       private static void 
matrixMultDictionariesAndOutputToColIndecesSparseDense(SparseBlock left, 
double[] right,
+               int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+               final int commonDim = Math.min(left.numRows(), right.length / 
colsRight.length);
+               for(int i = 0; i < commonDim; i++) {
+                       if(left.isEmpty(i))
+                               continue;
+                       final int apos = left.pos(i);
+                       final int alen = left.size(i) + apos;
+                       final int[] aix = left.indexes(i);
+                       final double[] leftVals = left.values(i);
+                       final int offRight = i * colsRight.length;
+                       for(int k = apos; k < alen; k++) {
+                               final int offOut = rowsLeft[aix[k]] * outCols;
+                               final double v = leftVals[k];
+                               for(int j = 0; j < colsRight.length; j++)
+                                       result[offOut + colsRight[j]] += v * 
right[offRight + j];
+                       }
+               }
+       }
+
+       private static void 
matrixMultDictionariesAndOutputToColIndecesDenseSparse(double[] left, 
SparseBlock right,
+               int[] rowsLeft, int[] colsRight, double[] result, int outCols) {
+               final int commonDim = Math.min(left.length / rowsLeft.length, 
right.numRows());
+               for(int i = 0; i < commonDim; i++) {
+                       if(right.isEmpty(i))
+                               continue;
+                       final int apos = right.pos(i);
+                       final int alen = right.size(i) + apos;
+                       final int[] aix = right.indexes(i);
+                       final double[] rightVals = right.values(i);
+                       final int offLeft = i * rowsLeft.length;
+                       for(int j = 0; j < rowsLeft.length; j++) {
+                               final int offOut = rowsLeft[j] * outCols;
+                               final double v = left[offLeft + j];
+                               if(v != 0)
+                                       for(int k = apos; k < alen; k++) {
+                                               result[offOut + 
colsRight[aix[k]]] += v * rightVals[k];
+                                       }
+                       }
+               }
+       }
+
        @Override
        public boolean isDense() {
                return !_zeros;
@@ -1009,7 +1161,8 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
         * @param rl     The row to start the matrix multiplication from
         * @param ru     The row to stop the matrix multiplication at.
         */
-       public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, 
double[] values, int rl, int ru) {
+       @Override
+       public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, 
int rl, int ru) {
                final int numVals = getNumValues();
                if(!(_dict instanceof MatrixBlockDictionary))
                        _dict = 
_dict.getAsMatrixBlockDictionary(_colIndexes.length);
@@ -1026,37 +1179,39 @@ public abstract class ColGroupValue extends 
ColGroupCompressed implements Clonea
                                preAgg.setNonZeros(numVals);
                                // LOG.error("PreAgg Sparsity " + 
preAgg.getSparsity() + " nnz " + preAgg.getNonZeros());
                                LibMatrixMult.matrixMult(preAgg, dictM, tmpRes);
-                               addToResult(tmpRes, result, i);
+                               addVectorToResult(tmpRes, result, i);
                                tmpRes.reset();
                        }
                }
        }
 
-       private void addToResult(MatrixBlock tmp, MatrixBlock result, int row) {
+       private void addVectorToResult(MatrixBlock tmp, MatrixBlock result, int 
row) {
                if(tmp.isEmpty())
                        return;
-               else if(tmp.isInSparseFormat()) {
-                       throw new NotImplementedException();
+               final double[] retV = result.getDenseBlockValues();
+               final int nColRet = result.getNumColumns();
+               final int offR = row * nColRet;
+               if(tmp.isInSparseFormat()) {
+                       final SparseBlock sb = tmp.getSparseBlock();
+                       if(sb.isEmpty(0))
+                               return;
+                       final int apos = sb.pos(0);
+                       final int alen = sb.size(0);
+                       final int[] aix = sb.indexes(0);
+                       final double[] avals = sb.values(0);
+                       for(int i = apos; i < apos + alen; i++)
+                               retV[offR + _colIndexes[aix[i]]] += avals[i];
+
                }
                else {
                        final double[] tmpV = tmp.getDenseBlockValues();
-                       final double[] retV = result.getDenseBlockValues();
-                       final int nColRet = result.getNumColumns();
                        // final int nColTmp = tmp.getNumColumns();
-                       final int offR = row * nColRet;
                        // for(int row = rl, offT = 0, offR = rl * nColRet; row 
< ru; row++, offT += nColTmp, offR += nColRet) {
-                       for(int col = 0; col < _colIndexes.length; col++) {
-                               final int colOffset = _colIndexes[col];
-                               retV[offR + colOffset] += tmpV[col];
-                       }
+                       for(int col = 0; col < _colIndexes.length; col++)
+                               retV[offR + _colIndexes[col]] += tmpV[col];
+
                        // }
                }
-
-       }
-
-       @Override
-       public void leftMultByMatrix(MatrixBlock matrix, MatrixBlock result, 
int rl, int ru) {
-               leftMultByMatrix(matrix, result, getValues(), rl, ru);
        }
 
        public AColGroup rightMultByMatrix(MatrixBlock right) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
index e5d2a15..e1d4152 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ADictionary.java
@@ -25,7 +25,7 @@ import java.io.IOException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.functionobjects.ValueFunction;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
@@ -113,16 +113,16 @@ public abstract class ADictionary {
         */
        public abstract ADictionary applyScalarOp(ScalarOperator op, double 
newVal, int numCols);
 
-       public ADictionary applyBinaryRowOp(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes,
+       public ADictionary applyBinaryRowOp(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes,
                boolean left) {
-               return (left) ? applyBinaryRowOpLeft(fn, v, sparseSafe, 
colIndexes) : applyBinaryRowOpRight(fn, v, sparseSafe,
+               return (left) ? applyBinaryRowOpLeft(op, v, sparseSafe, 
colIndexes) : applyBinaryRowOpRight(op, v, sparseSafe,
                        colIndexes);
        }
 
-       public abstract ADictionary applyBinaryRowOpLeft(ValueFunction fn, 
double[] v, boolean sparseSafe,
+       public abstract ADictionary applyBinaryRowOpLeft(BinaryOperator op, 
double[] v, boolean sparseSafe,
                int[] colIndexes);
 
-       public abstract ADictionary applyBinaryRowOpRight(ValueFunction fn, 
double[] v, boolean sparseSafe,
+       public abstract ADictionary applyBinaryRowOpRight(BinaryOperator op, 
double[] v, boolean sparseSafe,
                int[] colIndexes);
 
        /**
@@ -148,18 +148,14 @@ public abstract class ADictionary {
         * @param out the output sink to write the dictionary to.
         * @throws IOException if the sink fails.
         */
-       public void write(DataOutput out) throws IOException {
-               out.writeBoolean(isLossy());
-       }
+       public abstract void write(DataOutput out) throws IOException;
 
        /**
         * Calculate the space consumption if the dictionary is stored on disk.
         * 
         * @return the long count of bytes to store the dictionary.
         */
-       public long getExactSizeOnDisk() {
-               return 1;
-       }
+       public abstract long getExactSizeOnDisk();
 
        /**
         * Specify if the Dictionary is lossy.
@@ -275,7 +271,19 @@ public abstract class ADictionary {
         * @param counts The counts of the individual tuples contained, managed 
by the column group.
         * @return a new double array containing the most common value
         */
-       public abstract double[] getMostCommonTuple(int[] counts, int nCol);
+       public double[] getMostCommonTuple(int[] counts, int nCol) {
+               int maxIndex = 0;
+               int maxCount = 0;
+               for(int i = 0; i < counts.length; i++) {
+                       if(counts[i] > maxCount) {
+                               maxCount = counts[i];
+                               maxIndex = i;
+                       }
+               }
+               return getTuple(maxIndex, nCol);
+       }
+
+       public abstract double[] getTuple(int index, int nCol);
 
        /**
         * Allocate a new dictionary where the tuple given is subtracted from 
all tuples in the previous dictionary.
@@ -293,4 +301,13 @@ public abstract class ADictionary {
         * @return A Dictionary containing a MatrixBlock.
         */
        public abstract MatrixBlockDictionary getAsMatrixBlockDictionary(int 
nCol);
+
+       /**
+        * Scale all tuples contained in the dictionary by the scaling factor 
given in the int list.
+        * 
+        * @param scaling The ammout to multiply the given tuples with
+        * @param nCol    The number of columns contained in this column group.
+        * @return A New dictionary (since we don't want to modify the 
underlying dictionary)
+        */
+       public abstract ADictionary scaleTuples(int[] scaling, int nCol);
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
index 595c250..770557b 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/Dictionary.java
@@ -30,6 +30,7 @@ import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -119,15 +120,16 @@ public class Dictionary extends ADictionary {
        public Dictionary applyScalarOp(ScalarOperator op, double newVal, int 
numCols) {
                // allocate new array just once because we need to add the 
newVal.
                double[] values = new double[_values.length + numCols];
-               for(int i = 0; i < _values.length; i++) {
+               for(int i = 0; i < _values.length; i++)
                        values[i] = op.executeScalar(_values[i]);
-               }
+               
                Arrays.fill(values, _values.length, _values.length + numCols, 
newVal);
                return new Dictionary(values);
        }
 
        @Override
-       public Dictionary applyBinaryRowOpRight(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+       public Dictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+               ValueFunction fn = op.fn;
                final int len = size();
                final int lenV = colIndexes.length;
                if(sparseSafe) {
@@ -150,7 +152,8 @@ public class Dictionary extends ADictionary {
        }
 
        @Override
-       public Dictionary applyBinaryRowOpLeft(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+       public Dictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+               ValueFunction fn = op.fn;
                final int len = size();
                final int lenV = colIndexes.length;
                if(sparseSafe) {
@@ -194,6 +197,7 @@ public class Dictionary extends ADictionary {
 
        @Override
        public void write(DataOutput out) throws IOException {
+               out.writeByte(DictionaryFactory.Type.FP64_DICT.ordinal());
                out.writeInt(size());
                for(int i = 0; i < size(); i++)
                        out.writeDouble(_values[i]);
@@ -201,7 +205,7 @@ public class Dictionary extends ADictionary {
 
        @Override
        public long getExactSizeOnDisk() {
-               return 4 + 8 * size();
+               return 1 + 4 + 8 * size();
        }
 
        public int size() {
@@ -446,18 +450,11 @@ public class Dictionary extends ADictionary {
        }
 
        @Override
-       public double[] getMostCommonTuple(int[] counts, int nCol) {
-               int maxIndex = 0;
-               int maxCount = 0;
-               for(int i = 0; i < counts.length; i++) {
-                       if(counts[i] > maxCount) {
-                               maxCount = counts[i];
-                               maxIndex = i;
-                       }
-               }
+       public double[] getTuple(int index, int nCol) {
+
                final double[] tuple = new double[nCol];
                boolean allZero = true;
-               for(int i = maxIndex * nCol, off = 0; i < (maxIndex + 1) * nCol 
&& i < _values.length; i++, off++) {
+               for(int i = index * nCol, off = 0; i < (index + 1) * nCol && i 
< _values.length; i++, off++) {
                        final double v = _values[i];
                        if(v != 0) {
                                tuple[off] = _values[i];
@@ -493,6 +490,20 @@ public class Dictionary extends ADictionary {
                for(int k = 0; k < vlen; k++)
                        for(int j = 0, valOff = k * ncol; j < ncol; j++)
                                c[colIndexes[j]] = fn.execute(c[colIndexes[j]], 
getValue(valOff + j));
-               
+
+       }
+
+       @Override
+       public ADictionary scaleTuples(int[] scaling, int nCol) {
+               final double[] scaledValues = new double[_values.length];
+               int off = 0;
+               for(int tuple = 0; tuple < _values.length / nCol; tuple++) {
+                       final int scale = scaling[tuple];
+                       for(int v = 0; v < nCol; v++) {
+                               scaledValues[off] = _values[off] * scale;
+                               off++;
+                       }
+               }
+               return new Dictionary(scaledValues);
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
index bab32b0..1fd47d8 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java
@@ -25,6 +25,7 @@ import java.io.IOException;
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.Bitmap;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
@@ -36,26 +37,23 @@ public class DictionaryFactory {
 
        protected static final Log LOG = 
LogFactory.getLog(DictionaryFactory.class.getName());
 
+       public enum Type {
+               FP64_DICT, MATRIX_BLOCK_DICT, INT8_DICT
+       }
+
        public static ADictionary read(DataInput in) throws IOException {
-               boolean lossy = in.readBoolean();
-               if(lossy) {
-
-                       double scale = in.readDouble();
-                       int numVals = in.readInt();
-                       // read distinct values
-                       byte[] values = numVals == 0 ? null : new byte[numVals];
-                       for(int i = 0; i < numVals; i++)
-                               values[i] = in.readByte();
-                       return new QDictionary(values, scale);
-               }
-               else {
-                       int numVals = in.readInt();
-                       // read distinct values
-                       double[] values = new double[numVals];
-                       for(int i = 0; i < numVals; i++)
-                               values[i] = in.readDouble();
-                       return new Dictionary(values);
+               Type type = Type.values()[in.readByte()];
+               switch(type) {
+                       case FP64_DICT:
+                               return Dictionary.read(in);
+                       case MATRIX_BLOCK_DICT:
+                               return MatrixBlockDictionary.read(in);
+                       case INT8_DICT:
+                               return QDictionary.read(in);
+                       default:
+                               throw new DMLCompressionException("Unsupported 
type of dictionary : " + type);
                }
+
        }
 
        public static long getInMemorySize(int nrValues, int nrColumns, double 
tupleSparsity, boolean lossy) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
index 322e56b..aae66e6 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/MatrixBlockDictionary.java
@@ -1,12 +1,18 @@
 package org.apache.sysds.runtime.compress.colgroup.dictionary;
 
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
 import org.apache.commons.lang.NotImplementedException;
+import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
-import org.apache.sysds.runtime.functionobjects.ValueFunction;
+import org.apache.sysds.runtime.functionobjects.Minus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 public class MatrixBlockDictionary extends ADictionary {
@@ -24,7 +30,8 @@ public class MatrixBlockDictionary extends ADictionary {
     @Override
     public double[] getValues() {
         LOG.warn("Inefficient force dense format.");
-        _data.sparseToDense();
+        if(_data.isInSparseFormat())
+            _data.sparseToDense();
         return _data.getDenseBlockValues();
     }
 
@@ -39,15 +46,13 @@ public class MatrixBlockDictionary extends ADictionary {
     public int hasZeroTuple(int nCol) {
         if(_data.isInSparseFormat()) {
             SparseBlock sb = _data.getSparseBlock();
-            for(int i = 0; i < _data.getNumRows(); i++) {
-                if(sb.isEmpty(i)) {
+            for(int i = 0; i < _data.getNumRows(); i++)
+                if(sb.isEmpty(i))
                     return i;
-                }
-            }
         }
-        else {
+        else
             throw new NotImplementedException();
-        }
+
         return -1;
     }
 
@@ -163,18 +168,56 @@ public class MatrixBlockDictionary extends ADictionary {
     @Override
     public ADictionary applyScalarOp(ScalarOperator op, double newVal, int 
numCols) {
         MatrixBlock res = _data.scalarOperations(op, new MatrixBlock());
-        MatrixBlock res2 = res.append(new MatrixBlock(1, 1, newVal), new 
MatrixBlock());
-        return new MatrixBlockDictionary(res2);
+        final int lastRow = res.getNumRows();
+        MatrixBlock res2 = new MatrixBlock(lastRow + 1, res.getNumColumns(), 
true);
+        if(res.isEmpty()) {
+            for(int i = 0; i < numCols; i++)
+                res2.appendValue(lastRow, i, newVal);
+            return new MatrixBlockDictionary(res2);
+        }
+        else {
+            res.append(new MatrixBlock(1, numCols, newVal), res2, false);
+            return new MatrixBlockDictionary(res2);
+        }
     }
 
     @Override
-    public ADictionary applyBinaryRowOpLeft(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
-        throw new NotImplementedException();
+    public ADictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+        MatrixBlock rowVector = new MatrixBlock(1, colIndexes.length, false);
+        for(int i = 0; i < colIndexes.length; i++)
+            rowVector.quickSetValue(0, i, v[colIndexes[i]]);
+        MatrixBlock res = new MatrixBlock();
+        if(sparseSafe) {
+            rowVector.binaryOperations(op, _data, res);
+        }
+        else {
+            if(!_data.isInSparseFormat())
+                LOG.warn("Inefficient binary row op allocating Matrix multiple 
times");
+            MatrixBlock tmp = new MatrixBlock();
+            tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), 
tmp, false);
+            rowVector.binaryOperations(op, tmp, res);
+
+        }
+        return new MatrixBlockDictionary(res);
     }
 
     @Override
-    public ADictionary applyBinaryRowOpRight(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
-        throw new NotImplementedException();
+    public ADictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+        MatrixBlock rowVector = new MatrixBlock(1, colIndexes.length, false);
+        for(int i = 0; i < colIndexes.length; i++)
+            rowVector.quickSetValue(0, i, v[colIndexes[i]]);
+        MatrixBlock res = new MatrixBlock();
+        if(sparseSafe) {
+            _data.binaryOperations(op, rowVector, res);
+        }
+        else {
+            if(!_data.isInSparseFormat())
+                LOG.warn("Inefficient binary row op allocating Matrix multiple 
times");
+            MatrixBlock tmp = new MatrixBlock();
+            tmp = _data.append(new MatrixBlock(1, _data.getNumColumns(), 0), 
tmp, false);
+            tmp.binaryOperations(op, rowVector, res);
+        }
+        return new MatrixBlockDictionary(res);
     }
 
     @Override
@@ -382,7 +425,8 @@ public class MatrixBlockDictionary extends ADictionary {
 
     @Override
     public ADictionary sliceOutColumnRange(int idxStart, int idxEnd, int 
previousNumberOfColumns) {
-        throw new NotImplementedException();
+        MatrixBlock retBlock = _data.slice(0, _data.getNumRows() - 1, 
idxStart, idxEnd - 1);
+        return new MatrixBlockDictionary(retBlock);
     }
 
     @Override
@@ -399,11 +443,12 @@ public class MatrixBlockDictionary extends ADictionary {
     public long getNumberNonZeros(int[] counts, int nCol) {
         if(_data.isEmpty())
             return 0;
+
         long nnz = 0;
         if(_data.isInSparseFormat()) {
             SparseBlock sb = _data.getSparseBlock();
             for(int i = 0; i < _data.getNumRows(); i++)
-                if(sb.isEmpty(i))
+                if(!sb.isEmpty(i))
                     nnz += sb.size(i) * counts[i];
 
         }
@@ -430,17 +475,67 @@ public class MatrixBlockDictionary extends ADictionary {
 
     @Override
     public void addToEntry(Dictionary d, int fr, int to, int nCol) {
-        throw new NotImplementedException();
+        double[] v = d.getValues();
+        if(_data.isEmpty())
+            return;
+        else if(_data.isInSparseFormat()) {
+            SparseBlock sb = _data.getSparseBlock();
+            if(sb.isEmpty(fr))
+                return;
+            final int apos = sb.pos(fr);
+            final int alen = sb.size(fr) + apos;
+            final int[] aix = sb.indexes(fr);
+            final double[] avals = sb.values(fr);
+            final int offsetTo = nCol * to;
+            for(int j = apos; j < alen; j++) {
+                v[offsetTo + aix[j]] += avals[j];
+            }
+        }
+        else {
+            final int sf = nCol * fr; // start from
+            final int ef = sf + nCol; // end from
+            final double[] thisV = _data.getDenseBlockValues();
+            for(int i = sf, j = nCol * to; i < ef; i++, j++) {
+                v[j] += thisV[i];
+            }
+        }
     }
 
     @Override
-    public double[] getMostCommonTuple(int[] counts, int nCol) {
-        throw new NotImplementedException();
+    public double[] getTuple(int index, int nCol) {
+        if(_data.isEmpty() || index >= _data.getNumRows())
+            return null;
+        else if(_data.isInSparseFormat()) {
+            SparseBlock sb = _data.getSparseBlock();
+            if(sb.isEmpty(index))
+                return null;
+            double[] tuple = new double[nCol];
+            final int apos = sb.pos(index);
+            final int alen = sb.size(index) + apos;
+            final int[] aix = sb.indexes(index);
+            final double[] avals = sb.values(index);
+            for(int j = apos; j < alen; j++) {
+                tuple[aix[j]] = avals[j];
+            }
+            return tuple;
+        }
+        else {
+            double[] tuple = new double[nCol];
+            double[] values = _data.getDenseBlockValues();
+            int offset = index * nCol;
+            for(int i = 0; i < nCol; i++, offset++)
+                tuple[i] = values[offset];
+            return tuple;
+        }
     }
 
     @Override
     public ADictionary subtractTuple(double[] tuple) {
-        throw new NotImplementedException();
+        DenseBlockFP64 b = new DenseBlockFP64(new int[] {1, tuple.length}, 
tuple);
+        MatrixBlock rowVector = new MatrixBlock(1, tuple.length, b);
+        MatrixBlock res = new MatrixBlock(_data.getNumColumns(), 
_data.getNumRows(), _data.isInSparseFormat());
+        _data.binaryOperations(new BinaryOperator(Minus.getMinusFnObject()), 
rowVector, res);
+        return new MatrixBlockDictionary(res);
     }
 
     @Override
@@ -453,4 +548,64 @@ public class MatrixBlockDictionary extends ADictionary {
     public String toString() {
         return "MatrixBlock Dictionary :" + _data.toString();
     }
+
+    @Override
+    public ADictionary scaleTuples(int[] scaling, int nCol) {
+        if(_data.isEmpty()) {
+            throw new NotImplementedException("could return null here? or 
empty DictionaryMatrixBlock...");
+        }
+        else if(_data.isInSparseFormat()) {
+            MatrixBlock retBlock = new MatrixBlock(_data.getNumRows(), 
_data.getNumColumns(), true);
+            retBlock.allocateSparseRowsBlock(true);
+            SparseBlock sbRet = retBlock.getSparseBlock();
+            SparseBlock sbThis = _data.getSparseBlock();
+            for(int i = 0; i < _data.getNumRows(); i++) {
+                if(!sbThis.isEmpty(i)) {
+                    sbRet.set(i, sbThis.get(i), true);
+
+                    final int count = scaling[i];
+                    final int apos = sbRet.pos(i);
+                    final int alen = sbRet.size(i) + apos;
+                    final double[] avals = sbRet.values(i);
+                    for(int j = apos; j < alen; j++)
+                        avals[j] = count * avals[j];
+                }
+            }
+            retBlock.setNonZeros(_data.getNonZeros());
+            return new MatrixBlockDictionary(retBlock);
+        }
+        else {
+            final double[] _values = _data.getDenseBlockValues();
+            final double[] scaledValues = new double[_values.length];
+            int off = 0;
+            for(int tuple = 0; tuple < _values.length / nCol; tuple++) {
+                final int scale = scaling[tuple];
+                for(int v = 0; v < nCol; v++) {
+                    scaledValues[off] = _values[off] * scale;
+                    off++;
+                }
+            }
+            DenseBlockFP64 db = new DenseBlockFP64(new int[] 
{_data.getNumRows(), _data.getNumColumns()}, scaledValues);
+            MatrixBlock retBlock = new MatrixBlock(_data.getNumRows(), 
_data.getNumColumns(), db);
+            retBlock.setNonZeros(_data.getNonZeros());
+            return new MatrixBlockDictionary(retBlock);
+        }
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+        out.writeByte(DictionaryFactory.Type.MATRIX_BLOCK_DICT.ordinal());
+        _data.write(out);
+    }
+
+    public static MatrixBlockDictionary read(DataInput in) throws IOException {
+        MatrixBlock ret = new MatrixBlock();
+        ret.readFields(in);
+        return new MatrixBlockDictionary(ret);
+    }
+
+    @Override
+    public long getExactSizeOnDisk() {
+        return 1 + _data.getExactSizeOnDisk();
+    }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
index 70836ca..4c3f9f9 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/QDictionary.java
@@ -19,6 +19,7 @@
 
 package org.apache.sysds.runtime.compress.colgroup.dictionary;
 
+import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.util.Arrays;
@@ -30,6 +31,7 @@ import org.apache.sysds.runtime.functionobjects.Divide;
 import org.apache.sysds.runtime.functionobjects.Multiply;
 import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.functionobjects.ValueFunction;
+import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.utils.MemoryEstimates;
 
@@ -193,8 +195,8 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public QDictionary applyBinaryRowOpRight(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
-
+       public QDictionary applyBinaryRowOpRight(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+               ValueFunction fn = op.fn;
                if(_values == null) {
                        if(sparseSafe) {
                                return new QDictionary(null, 1);
@@ -234,7 +236,7 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public QDictionary applyBinaryRowOpLeft(ValueFunction fn, double[] v, 
boolean sparseSafe, int[] colIndexes) {
+       public QDictionary applyBinaryRowOpLeft(BinaryOperator op, double[] v, 
boolean sparseSafe, int[] colIndexes) {
                throw new NotImplementedException("Not Implemented yet");
        }
 
@@ -256,16 +258,26 @@ public class QDictionary extends ADictionary {
 
        @Override
        public void write(DataOutput out) throws IOException {
-               super.write(out);
+               out.writeByte(DictionaryFactory.Type.INT8_DICT.ordinal());
                out.writeDouble(_scale);
                out.writeInt(_values.length);
                for(int i = 0; i < _values.length; i++)
                        out.writeByte(_values[i]);
        }
 
+       public static QDictionary read(DataInput in) throws IOException {
+               double scale = in.readDouble();
+               int numVals = in.readInt();
+               byte[] values = new byte[numVals];
+               for(int i = 0; i < numVals; i++) {
+                       values[i] = in.readByte();
+               }
+               return new QDictionary(values, scale);
+       }
+
        @Override
        public long getExactSizeOnDisk() {
-               return 8 + 4 + size();
+               return 1 + 8 + 4 + size();
        }
 
        @Override
@@ -491,7 +503,7 @@ public class QDictionary extends ADictionary {
        }
 
        @Override
-       public double[] getMostCommonTuple(int[] counts, int nCol) {
+       public double[] getTuple(int index, int nCol) {
                return null;
        }
 
@@ -507,6 +519,11 @@ public class QDictionary extends ADictionary {
 
        @Override
        public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) {
-               throw new NotImplementedException();    
+               throw new NotImplementedException();
+       }
+
+       @Override
+       public ADictionary scaleTuples(int[] scaling, int nCol) {
+               throw new NotImplementedException();
        }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/BitmapEncoder.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/BitmapEncoder.java
index 64b0bdd..9a52348 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/BitmapEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/BitmapEncoder.java
@@ -196,6 +196,7 @@ public class BitmapEncoder {
 
                while((cellVals = rowReader.nextRow()) != null) {
                        if(cellVals.getData() != null) {
+                               cellVals.resetHash();
                                IntArrayList lstPtr = 
distinctVals.get(cellVals);
                                if(lstPtr == null) {
                                        // create new objects only on demand
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
index d454a7f..48c2810 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibBinaryCellOp.java
@@ -177,7 +177,7 @@ public class CLALibBinaryCellOp {
                }
 
                List<AColGroup> newColGroups = new 
ArrayList<>(oldColGroups.size());
-               int k = OptimizerUtils.getConstrainedNumThreads(-1);
+               int k = op.getNumThreads();
                ExecutorService pool = CommonThreadPool.get(k);
                ArrayList<BinaryMVRowTask> tasks = new ArrayList<>();
                try {
@@ -238,8 +238,7 @@ public class CLALibBinaryCellOp {
                boolean foundConst = false;
                for(AColGroup grp : m1.getColGroups()) {
                        if(!m2.isEmpty() && !foundConst && grp instanceof 
ColGroupConst) {
-                               ADictionary newDict = ((ColGroupValue) 
grp).applyBinaryRowOp(op.fn, m2.getDenseBlockValues(), false,
-                                       left);
+                               ADictionary newDict = ((ColGroupValue) 
grp).applyBinaryRowOp(op, m2.getDenseBlockValues(), false, left);
                                newColGroups.add(new 
ColGroupConst(grp.getColIndices(), m1.getNumRows(), newDict));
                                foundConst = true;
                        }
@@ -251,7 +250,7 @@ public class CLALibBinaryCellOp {
                        int[] colIndexes = oldColGroups.get(0).getColIndices();
                        double[] v = m2.getDenseBlockValues();
                        ADictionary newDict = new Dictionary(new 
double[colIndexes.length]);
-                       newDict = newDict.applyBinaryRowOp(op.fn, v, true, 
colIndexes, left);
+                       newDict = newDict.applyBinaryRowOp(op, v, true, 
colIndexes, left);
                        newColGroups.add(new ColGroupConst(colIndexes, 
m1.getNumRows(), newDict));
                }
                ret.allocateColGroupList(newColGroups);
@@ -354,7 +353,7 @@ public class CLALibBinaryCellOp {
                public Integer call() {
                        // unsafe decompress, since we count nonzeros 
afterwards.
                        for(AColGroup g : _m1.getColGroups())
-                               g.decompressToBlock(_ret, _rl, _ru, 
g.getValues(), false);
+                               g.decompressToBlock(_ret, _rl, _ru, false);
 
                        if(_m2.isInSparseFormat())
                                throw new NotImplementedException("Not 
Implemented sparse Format execution for MM.");
@@ -398,7 +397,7 @@ public class CLALibBinaryCellOp {
                public Integer call() {
                        // unsafe decompress, since we count nonzeros 
afterwards.
                        for(AColGroup g : _m1.getColGroups())
-                               g.decompressToBlock(_ret, _rl, _ru, 
g.getValues(), false);
+                               g.decompressToBlock(_ret, _rl, _ru, false);
 
                        if(_m2.isInSparseFormat())
                                throw new NotImplementedException("Not 
Implemented sparse Format execution for MM.");
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
index f6c734c..c0ac099 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibCompAgg.java
@@ -496,7 +496,7 @@ public class CLALibCompAgg {
                private MatrixBlock decompressToTemp() {
                        MatrixBlock tmp = getTmp();
                        for(AColGroup g : _m1.getColGroups())
-                               g.decompressToBlockUnSafe(tmp, _rl, _ru, 0, 
g.getValues());
+                               g.decompressToBlockUnSafe(tmp, _rl, _ru, 0);
                        tmp.setNonZeros(_rl + _ru);
                        return tmp;
                }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
index ba22642..1fff010 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibLeftMultBy.java
@@ -32,9 +32,8 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
-import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupValue;
-import org.apache.sysds.runtime.compress.colgroup.pre.IPreAggregate;
+import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
@@ -85,29 +84,16 @@ public class CLALibLeftMultBy {
                return ret;
        }
 
-       // public static void leftMultByTransposeSelf(CompressedMatrixBlock mb, 
MatrixBlock result, int k) {
-
-       // }
-
        public static void leftMultByTransposeSelf(List<AColGroup> groups, 
MatrixBlock result, int k, int numColumns,
                Pair<Integer, int[]> v, boolean overlapping) {
 
                result.allocateDenseBlock();
 
-               if(!overlapping && groups.get(0).getNumCols() == numColumns) {
-                       leftMultBySelfDiagonalColGroup(groups, result, 
numColumns);
-                       return;
-               }
-
                if(overlapping) {
                        LOG.warn("Inefficient TSMM with overlapping matrix 
could be implemented multi-threaded but is not yet.");
                        leftMultByCompressedTransposedMatrix(groups, groups, 
result);
-
-                       result.recomputeNonZeros();
-                       return;
                }
-
-               if(k <= 1) {
+               else if(k <= 1) {
                        for(int i = 0; i < groups.size(); i++)
                                
leftMultByCompressedTransposedMatrix(groups.get(i), groups, result, i, 
groups.size());
                }
@@ -128,35 +114,37 @@ public class CLALibLeftMultBy {
                                throw new DMLRuntimeException(e);
                        }
                }
-
+               // LOG.error(result);
+               // Move values in the lower part of the matrix to the upper part
                copyToUpperTriangle(result.getDenseBlockValues(), numColumns);
-               result.recomputeNonZeros();
+
+               // LOG.error(result);
+               // calculate the number of non zeros, and allocate all value 
locations by copying upper triangle back to bottom.
+               long nnz = LinearAlgebraUtils.copyUpperToLowerTriangle(result);
+               result.setNonZeros(nnz);
+               // Evaluate if the output should be sparsely allocated.
+               result.examSparsity(false);
+               result.setNonZeros(nnz);
+               // LOG.error(result);
        }
 
        private static void copyToUpperTriangle(final double[] c, final int 
cols) {
                for(int i = 0, offC = 0; i < cols; i++, offC += cols)
                        for(int j = i, offR = i * cols; j < cols; j++, offR += 
cols) {
-                               final double d = c[i + offR];
-                               if(d != 0)
-                                       c[offC + j] = d;
+                               final double prev = c[offC + j];
+                               if(prev == 0)
+                                       c[offC + j] = c[i + offR];
                        }
 
        }
 
-       private static void leftMultBySelfDiagonalColGroup(List<AColGroup> 
groups, MatrixBlock result, int numColumns) {
-               double[] outValues = result.getDenseBlockValues();
-               for(AColGroup g : groups)
-                       g.tsmm(outValues, numColumns);
-
-       }
-
        private static MatrixBlock 
leftMultByCompressedTransposedMatrix(List<AColGroup> colGroups,
                CompressedMatrixBlock that, MatrixBlock ret, int k, int 
numColumns, Pair<Integer, int[]> v,
                boolean overlapping) {
 
                ret.allocateDenseBlock();
                List<AColGroup> thatCGs = that.getColGroups();
-               Pair<Integer, int[]> thatV = that.getMaxNumValues();
+               // Pair<Integer, int[]> thatV = that.getMaxNumValues();
 
                if(k <= 1 || overlapping || that.isOverlapping()) {
                        if(overlapping || that.isOverlapping())
@@ -170,14 +158,14 @@ public class CLALibLeftMultBy {
                                ArrayList<Callable<Object>> tasks = new 
ArrayList<>();
                                for(int i = 0; i < thatCGs.size(); i++) {
 
-                                       if(thatCGs.get(i).getNumCols() > 1 || 
thatCGs.get(i).getCompType() == CompressionType.CONST)
-                                               tasks.add(new 
LeftMultByCompressedTransposedMatrixTask(colGroups, thatCGs.get(i), ret, 0,
-                                                       colGroups.size()));
-                                       else {
-                                               int row = 
thatCGs.get(i).getColIndices()[0];
-                                               tasks.add(new 
LeftMultByCompressedTransposedMatrixTask2(colGroups, thatCGs, ret, v, thatV, 
row,
-                                                       row + 1, overlapping, 
1));
-                                       }
+                                       // if(thatCGs.get(i).getNumCols() > 1 
|| thatCGs.get(i).getCompType() == CompressionType.CONST)
+                                       tasks.add(new 
LeftMultByCompressedTransposedMatrixTask(colGroups, thatCGs.get(i), ret, 0,
+                                               colGroups.size()));
+                                       // else {
+                                       // int row = 
thatCGs.get(i).getColIndices()[0];
+                                       // tasks.add(new 
LeftMultByCompressedTransposedMatrixTask2(colGroups, thatCGs, ret, v, thatV, 
row,
+                                       // row + 1, overlapping, 1));
+                                       // }
                                }
 
                                for(Future<Object> tret : pool.invokeAll(tasks))
@@ -211,7 +199,7 @@ public class CLALibLeftMultBy {
                @Override
                public Object call() {
                        try {
-                               IPreAggregate.setupThreadLocalMemory(1024);
+                               // IPreAggregate.setupThreadLocalMemory(1024);
                                leftMultByCompressedTransposedMatrix(_left, 
_groups, _ret, _start, _end);
 
                        }
@@ -232,8 +220,13 @@ public class CLALibLeftMultBy {
        private static void leftMultByCompressedTransposedMatrix(AColGroup lhs, 
List<AColGroup> thisCG, MatrixBlock ret,
                int colGroupStart, int colGroupEnd) {
 
-               for(; colGroupStart < colGroupEnd; colGroupStart++)
-                       thisCG.get(colGroupStart).leftMultByAColGroup(lhs, ret);
+               for(; colGroupStart < colGroupEnd; colGroupStart++) {
+                       AColGroup rhs = thisCG.get(colGroupStart);
+                       if(rhs != lhs)
+                               rhs.leftMultByAColGroup(lhs, ret);
+                       else
+                               rhs.tsmm(ret.getDenseBlockValues(), 
ret.getNumColumns());
+               }
 
        }
 
@@ -265,14 +258,14 @@ public class CLALibLeftMultBy {
                                else {
                                        for(AColGroup g : colGroups) {
                                                // if(g instanceof ColGroupDDC) 
{
-                                               //      tasks.add(new 
LeftMatrixColGroupMultTask(g, that, ret, 0, that.getNumRows(), v));
+                                               // tasks.add(new 
LeftMatrixColGroupMultTask(g, that, ret, 0, that.getNumRows(), v));
                                                // }
                                                // else {
 
-                                                       for(int blo = 0; blo < 
that.getNumRows(); blo += rowBlockSize) {
-                                                               tasks.add(new 
LeftMatrixColGroupMultTask(g, that, ret, blo,
-                                                                       
Math.min(blo + rowBlockSize, that.getNumRows()), v));
-                                                       }
+                                               for(int blo = 0; blo < 
that.getNumRows(); blo += rowBlockSize) {
+                                                       tasks.add(new 
LeftMatrixColGroupMultTask(g, that, ret, blo,
+                                                               Math.min(blo + 
rowBlockSize, that.getNumRows()), v));
+                                               }
                                                // }
                                        }
                                }
@@ -291,93 +284,94 @@ public class CLALibLeftMultBy {
                return ret;
        }
 
-       private static void 
leftMultByCompressedTransposeRowSection(List<AColGroup> thisGroups, 
List<AColGroup> thatGroups,
-               MatrixBlock result, Pair<Integer, int[]> v, Pair<Integer, 
int[]> thatV, int rl, int ru, boolean overlapping,
-               int k) {
-               if(k > 1 && !overlapping)
-                       
leftMultByCompressedTransposeRowSectionParallel(thisGroups, thatGroups, result, 
v, thatV, rl, ru, k);
-               else
-                       
leftMultByCompressedTransposeRowSectionSingleThread(thisGroups, thatGroups, 
result, v, thatV, rl, ru);
+       // private static void 
leftMultByCompressedTransposeRowSection(List<AColGroup> thisGroups, 
List<AColGroup>
+       // thatGroups,
+       // MatrixBlock result, Pair<Integer, int[]> v, Pair<Integer, int[]> 
thatV, int rl, int ru, boolean overlapping,
+       // int k) {
+       // if(k > 1 && !overlapping)
+       // leftMultByCompressedTransposeRowSectionParallel(thisGroups, 
thatGroups, result, v, thatV, rl, ru, k);
+       // else
+       // leftMultByCompressedTransposeRowSectionSingleThread(thisGroups, 
thatGroups, result, v, thatV, rl, ru);
 
-       }
+       // }
 
-       private static void 
leftMultByCompressedTransposeRowSectionParallel(List<AColGroup> thisGroups,
-               List<AColGroup> thatGroups, MatrixBlock result, Pair<Integer, 
int[]> v, Pair<Integer, int[]> thatV, int rl,
-               int ru, int k) {
+       // private static void 
leftMultByCompressedTransposeRowSectionParallel(List<AColGroup> thisGroups,
+       // List<AColGroup> thatGroups, MatrixBlock result, Pair<Integer, int[]> 
v, Pair<Integer, int[]> thatV, int rl,
+       // int ru, int k) {
 
-               // preallocated dense tmp matrix blocks
-               MatrixBlock lhs = new MatrixBlock(1, 
thisGroups.get(0).getNumRows(), false);
-               MatrixBlock tmpret = new MatrixBlock(1, result.getNumColumns(), 
false);
-               lhs.allocateDenseBlock();
-               tmpret.allocateDenseBlock();
+       // // preallocated dense tmp matrix blocks
+       // MatrixBlock lhs = new MatrixBlock(1, thisGroups.get(0).getNumRows(), 
false);
+       // MatrixBlock tmpret = new MatrixBlock(1, result.getNumColumns(), 
false);
+       // lhs.allocateDenseBlock();
+       // tmpret.allocateDenseBlock();
 
-               ExecutorService pool = CommonThreadPool.get(k);
-               ArrayList<leftMultByVectorTransposeTask> tasks = new 
ArrayList<>();
-               for(int j = rl; j < ru; j++) {
-                       AColGroup.decompressColumnToBlock(lhs, j, thatGroups);
-                       if(!lhs.isEmptyBlock(false)) {
+       // ExecutorService pool = CommonThreadPool.get(k);
+       // ArrayList<leftMultByVectorTransposeTask> tasks = new ArrayList<>();
+       // for(int j = rl; j < ru; j++) {
+       // AColGroup.decompressColumnToBlock(lhs, j, thatGroups);
+       // if(!lhs.isEmptyBlock(false)) {
 
-                               try {
-                                       int groupBatch = 
Math.max(thisGroups.size() / k, 1);
+       // try {
+       // int groupBatch = Math.max(thisGroups.size() / k, 1);
 
-                                       for(int i = 0; i * groupBatch < 
thisGroups.size(); i++) {
-                                               tasks.add(new 
leftMultByVectorTransposeTask(thisGroups, lhs, tmpret, i * groupBatch,
-                                                       
Math.min(thisGroups.size(), (i + 1) * groupBatch), v));
-                                       }
-                                       for(Future<Object> future : 
pool.invokeAll(tasks))
-                                               future.get();
-                               }
-                               catch(InterruptedException | ExecutionException 
e) {
-                                       throw new DMLRuntimeException(e);
-                               }
+       // for(int i = 0; i * groupBatch < thisGroups.size(); i++) {
+       // tasks.add(new leftMultByVectorTransposeTask(thisGroups, lhs, tmpret, 
i * groupBatch,
+       // Math.min(thisGroups.size(), (i + 1) * groupBatch), v));
+       // }
+       // for(Future<Object> future : pool.invokeAll(tasks))
+       // future.get();
+       // }
+       // catch(InterruptedException | ExecutionException e) {
+       // throw new DMLRuntimeException(e);
+       // }
 
-                               double[] tmpRetValues = 
tmpret.getDenseBlockValues();
-                               double[] resultValues = 
result.getDenseBlockValues();
-                               int offset = tmpret.getNumColumns() * j;
-                               for(int i = 0; i < tmpret.getNumColumns(); i++, 
offset++) {
-                                       resultValues[offset] += tmpRetValues[i];
-                                       tmpRetValues[i] = 0;
-                               }
-                       }
-                       lhs.reset();
-                       tasks.clear();
-               }
-               pool.shutdown();
+       // double[] tmpRetValues = tmpret.getDenseBlockValues();
+       // double[] resultValues = result.getDenseBlockValues();
+       // int offset = tmpret.getNumColumns() * j;
+       // for(int i = 0; i < tmpret.getNumColumns(); i++, offset++) {
+       // resultValues[offset] += tmpRetValues[i];
+       // tmpRetValues[i] = 0;
+       // }
+       // }
+       // lhs.reset();
+       // tasks.clear();
+       // }
+       // pool.shutdown();
 
-               // post processing
-               ColGroupValue.cleanupThreadLocalMemory();
-       }
+       // // post processing
+       // ColGroupValue.cleanupThreadLocalMemory();
+       // }
 
-       private static void 
leftMultByCompressedTransposeRowSectionSingleThread(List<AColGroup> thisGroups,
-               List<AColGroup> thatGroups, MatrixBlock result, Pair<Integer, 
int[]> v, Pair<Integer, int[]> thatV, int rl,
-               int ru) {
-               final int numRows = thisGroups.get(0).getNumRows();
-
-               // preallocated dense tmp matrix blocks
-               MatrixBlock lhs = new MatrixBlock(1, numRows, false);
-               MatrixBlock tmpret = new MatrixBlock(1, result.getNumColumns(), 
false);
-
-               lhs.allocateDenseBlock();
-               tmpret.allocateDenseBlock();
-
-               for(int j = rl; j < ru; j++) {
-                       AColGroup.decompressColumnToBlock(lhs, j, thatGroups);
-                       if(!lhs.isEmptyBlock(false)) {
-                               for(AColGroup grp : thisGroups)
-                                       grp.leftMultByMatrix(lhs, tmpret);
-
-                               double[] tmpRetValues = 
tmpret.getDenseBlockValues();
-                               double[] resultValues = 
result.getDenseBlockValues();
-                               int offset = tmpret.getNumColumns() * j;
-                               for(int i = 0; i < tmpret.getNumColumns(); i++, 
offset++) {
-                                       resultValues[offset] += tmpRetValues[i];
-                                       tmpRetValues[i] = 0;
-                               }
-                       }
-                       lhs.reset();
-               }
+       // private static void 
leftMultByCompressedTransposeRowSectionSingleThread(List<AColGroup> thisGroups,
+       // List<AColGroup> thatGroups, MatrixBlock result, Pair<Integer, int[]> 
v, Pair<Integer, int[]> thatV, int rl,
+       // int ru) {
+       // final int numRows = thisGroups.get(0).getNumRows();
+
+       // // preallocated dense tmp matrix blocks
+       // MatrixBlock lhs = new MatrixBlock(1, numRows, false);
+       // MatrixBlock tmpret = new MatrixBlock(1, result.getNumColumns(), 
false);
+
+       // lhs.allocateDenseBlock();
+       // tmpret.allocateDenseBlock();
+
+       // for(int j = rl; j < ru; j++) {
+       // AColGroup.decompressColumnToBlock(lhs, j, thatGroups);
+       // if(!lhs.isEmptyBlock(false)) {
+       // for(AColGroup grp : thisGroups)
+       // grp.leftMultByMatrix(lhs, tmpret);
+
+       // double[] tmpRetValues = tmpret.getDenseBlockValues();
+       // double[] resultValues = result.getDenseBlockValues();
+       // int offset = tmpret.getNumColumns() * j;
+       // for(int i = 0; i < tmpret.getNumColumns(); i++, offset++) {
+       // resultValues[offset] += tmpRetValues[i];
+       // tmpRetValues[i] = 0;
+       // }
+       // }
+       // lhs.reset();
+       // }
 
-       }
+       // }
 
        private static class LeftMatrixMatrixMultTask implements 
Callable<Object> {
                private final List<AColGroup> _group;
@@ -400,7 +394,7 @@ public class CLALibLeftMultBy {
                @Override
                public Object call() {
                        try {
-                               
ColGroupValue.setupThreadLocalMemory(_v.getLeft() * (_ru - _rl));
+                               
ColGroupValue.setupThreadLocalMemory(_v.getLeft());
                                for(int j = 0; j < _group.size(); j++)
                                        _group.get(j).leftMultByMatrix(_that, 
_ret, _rl, _ru);
                        }
@@ -433,7 +427,7 @@ public class CLALibLeftMultBy {
                public Object call() {
 
                        try {
-                               
ColGroupValue.setupThreadLocalMemory(_v.getLeft() * (_ru - _rl));
+                               
ColGroupValue.setupThreadLocalMemory(_v.getLeft());
                                _group.leftMultByMatrix(_that, _ret, _rl, _ru);
                        }
                        catch(Exception e) {
@@ -443,65 +437,65 @@ public class CLALibLeftMultBy {
                }
        }
 
-       private static class LeftMultByCompressedTransposedMatrixTask2 
implements Callable<Object> {
-               private final List<AColGroup> _groups;
-               private final List<AColGroup> _thatGroups;
-               private final MatrixBlock _ret;
-               private final int _rl;
-               private final int _ru;
-               private final Pair<Integer, int[]> _v;
-               private final Pair<Integer, int[]> _thatV;
-               private final boolean _overlapping;
-               private final int _extraThreads;
-
-               protected 
LeftMultByCompressedTransposedMatrixTask2(List<AColGroup> thisGroups, 
List<AColGroup> thatGroups,
-                       MatrixBlock ret, Pair<Integer, int[]> v, Pair<Integer, 
int[]> thatV, int rl, int ru, boolean overlapping,
-                       int extraThreads) {
-                       _groups = thisGroups;
-                       _thatGroups = thatGroups;
-                       _ret = ret;
-                       _rl = rl;
-                       _ru = ru;
-                       _v = v;
-                       _thatV = thatV;
-                       _overlapping = overlapping;
-                       _extraThreads = extraThreads;
-               }
-
-               @Override
-               public Object call() {
-                       
ColGroupValue.setupThreadLocalMemory(Math.max(_v.getLeft(), _thatV.getLeft()) + 
1);
-                       leftMultByCompressedTransposeRowSection(_groups, 
_thatGroups, _ret, _v, _thatV, _rl, _ru, _overlapping,
-                               _extraThreads);
-                       return null;
-               }
-       }
+       // private static class LeftMultByCompressedTransposedMatrixTask2 
implements Callable<Object> {
+       // private final List<AColGroup> _groups;
+       // private final List<AColGroup> _thatGroups;
+       // private final MatrixBlock _ret;
+       // private final int _rl;
+       // private final int _ru;
+       // private final Pair<Integer, int[]> _v;
+       // private final Pair<Integer, int[]> _thatV;
+       // private final boolean _overlapping;
+       // private final int _extraThreads;
+
+       // protected LeftMultByCompressedTransposedMatrixTask2(List<AColGroup> 
thisGroups, List<AColGroup> thatGroups,
+       // MatrixBlock ret, Pair<Integer, int[]> v, Pair<Integer, int[]> thatV, 
int rl, int ru, boolean overlapping,
+       // int extraThreads) {
+       // _groups = thisGroups;
+       // _thatGroups = thatGroups;
+       // _ret = ret;
+       // _rl = rl;
+       // _ru = ru;
+       // _v = v;
+       // _thatV = thatV;
+       // _overlapping = overlapping;
+       // _extraThreads = extraThreads;
+       // }
 
-       private static class leftMultByVectorTransposeTask implements 
Callable<Object> {
-               private final List<AColGroup> _grps;
-               private final MatrixBlock _rowVector;
-               private final MatrixBlock _result;
-               private final int _gl;
-               private final int _gu;
-               private final Pair<Integer, int[]> _v;
+       // @Override
+       // public Object call() {
+       // ColGroupValue.setupThreadLocalMemory(Math.max(_v.getLeft(), 
_thatV.getLeft()));
+       // leftMultByCompressedTransposeRowSection(_groups, _thatGroups, _ret, 
_v, _thatV, _rl, _ru, _overlapping,
+       // _extraThreads);
+       // return null;
+       // }
+       // }
 
-               protected leftMultByVectorTransposeTask(List<AColGroup> grps, 
MatrixBlock rowVector, MatrixBlock result, int gl,
-                       int gu, Pair<Integer, int[]> v) {
-                       _grps = grps;
-                       _rowVector = rowVector;
-                       _result = result;
-                       _gl = gl;
-                       _gu = gu;
-                       _v = v;
-               }
+       // private static class leftMultByVectorTransposeTask implements 
Callable<Object> {
+       // private final List<AColGroup> _grps;
+       // private final MatrixBlock _rowVector;
+       // private final MatrixBlock _result;
+       // private final int _gl;
+       // private final int _gu;
+       // private final Pair<Integer, int[]> _v;
+
+       // protected leftMultByVectorTransposeTask(List<AColGroup> grps, 
MatrixBlock rowVector, MatrixBlock result, int gl,
+       // int gu, Pair<Integer, int[]> v) {
+       // _grps = grps;
+       // _rowVector = rowVector;
+       // _result = result;
+       // _gl = gl;
+       // _gu = gu;
+       // _v = v;
+       // }
 
-               @Override
-               public Object call() {
-                       ColGroupValue.setupThreadLocalMemory(_v.getLeft() + 1);
-                       for(int i = _gl; i < _gu; i++) {
-                               _grps.get(i).leftMultByMatrix(_rowVector, 
_result);
-                       }
-                       return null;
-               }
-       }
+       // @Override
+       // public Object call() {
+       // ColGroupValue.setupThreadLocalMemory(_v.getLeft());
+       // for(int i = _gl; i < _gu; i++) {
+       // _grps.get(i).leftMultByMatrix(_rowVector, _result);
+       // }
+       // return null;
+       // }
+       // }
 }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRelationalOp.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRelationalOp.java
index af4e679..ce6b3d3 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRelationalOp.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibRelationalOp.java
@@ -156,7 +156,7 @@ public class CLALibRelationalOp {
                        MatrixBlock tmp = new MatrixBlock(blkz, cols, false, 
-1).allocateBlock();
                        for(int i = 0; i * blkz < outRows; i++) {
                                for(MinMaxGroup mmg : minMax) {
-                                       mmg.g.decompressToBlock(tmp, i * blkz, 
Math.min((i + 1) * blkz, rows), 0, mmg.values);
+                                       mmg.g.decompressToBlock(tmp, i * blkz, 
Math.min((i + 1) * blkz, rows), 0);
                                }
                                for(int row = 0; row < blkz && row < rows - i * 
blkz; row++) {
                                        int off = (row + i * blkz);
@@ -260,11 +260,8 @@ public class CLALibRelationalOp {
                        }
 
                        for(MinMaxGroup mmg : _minMax) {
-                               mmg.g.decompressToBlockUnSafe(tmp,
-                                       _i * _blkz,
-                                       Math.min((_i + 1) * _blkz, 
mmg.g.getNumRows()),
-                                       0,
-                                       mmg.values);
+                               if(mmg.g.getNumberNonZeros() != 0)
+                                       mmg.g.decompressToBlockUnSafe(tmp, _i * 
_blkz, Math.min((_i + 1) * _blkz, mmg.g.getNumRows()), 0);
                        }
 
                        for(int row = 0, off = _i * _blkz; row < _blkz && row < 
_rows - _i * _blkz; row++, off++) {
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
index e3ef846..d1b766e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibScalar.java
@@ -100,7 +100,6 @@ public class CLALibScalar {
                }
 
                ret.recomputeNonZeros();
-
                return ret;
 
        }
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderCompressedSelection.java
 
b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderCompressedSelection.java
index 3dafebe..47941ab 100644
--- 
a/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderCompressedSelection.java
+++ 
b/src/main/java/org/apache/sysds/runtime/compress/readers/ReaderCompressedSelection.java
@@ -53,11 +53,7 @@ public class ReaderCompressedSelection extends 
ReaderColumnSelection {
                        // decompress into the tmpBlock.
                        currentBlock = _lastRow / decompressRowCount;
                        for(AColGroup g : compressedOverlap.getColGroups()) {
-                               g.decompressToBlockUnSafe(_tmp,
-                                       _lastRow,
-                                       Math.min(_lastRow + decompressRowCount, 
g.getNumRows()),
-                                       0,
-                                       g.getValues());
+                               g.decompressToBlockUnSafe(_tmp, _lastRow, 
Math.min(_lastRow + decompressRowCount, g.getNumRows()), 0);
                        }
                }
 
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/utils/DblArray.java 
b/src/main/java/org/apache/sysds/runtime/compress/utils/DblArray.java
index 0810e8d..6385697 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/DblArray.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/DblArray.java
@@ -27,6 +27,8 @@ import java.util.Arrays;
 public class DblArray {
        private double[] _arr;
 
+       private int hash = 0;
+
        public DblArray() {
                _arr = null;
        }
@@ -43,9 +45,16 @@ public class DblArray {
                return _arr;
        }
 
+       public void resetHash(){
+               hash = 0;
+       }
+
        @Override
        public int hashCode() {
-               return _arr == null ? 0 : Arrays.hashCode(_arr);
+               if(hash != 0)
+                       return hash;
+               hash = _arr == null ? 0 : Arrays.hashCode(_arr);
+               return hash;
        }
 
        @Override
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 819cd6b..3ff1bb9 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -1230,7 +1230,7 @@ public class LibMatrixMult
                                if( a.isEmpty(i) ) continue; 
                                int apos = a.pos(i);
                                int alen = a.size(i);
-                               int[] aix = a.indexes(i);
+                               // int[] aix = a.indexes(i);
                                double[] avals = a.values(i);
                                double[] cvals = c.values(i);
                                int cix = c.pos(i);
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
index 4031a67..fe25916 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/MatrixBlock.java
@@ -1078,6 +1078,16 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                return evalSparseFormatOnDisk(rlen, clen, nonZeros);
        }
        
+       /**
+        * Evaluates if this matrix block should be in sparse format in
+        * memory. Depending on the current representation, the state of the
+        * matrix block is changed to the right representation if necessary. 
+        * Note that this consumes for the time of execution memory for both 
+        * representations.
+        * 
+        * Allowing CSR format is default for this operation.
+        * 
+        */
        public void examSparsity() {
                examSparsity(true);
        }
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
index 4cad36e..de88ad6 100644
--- 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
@@ -30,6 +30,7 @@ import java.io.DataOutputStream;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettingsBuilder;
 import org.apache.sysds.runtime.compress.CompressionStatistics;
+import 
org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.AColGroup;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCountDistinct;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -181,7 +182,8 @@ public class CompressedMatrixTest extends 
AbstractCompressedUnaryTests {
        @Test
        public void testCompressionRatio() {
                try {
-                       if(!(cmb instanceof CompressedMatrixBlock))
+                       if(!(cmb instanceof CompressedMatrixBlock) ||
+                               compressionSettings.columnPartitioner == 
PartitionerType.COST_MATRIX_MULT)
                                return;
                        CompressionStatistics cStat = cmbStats;
                        if(cStat != null)
@@ -196,7 +198,8 @@ public class CompressedMatrixTest extends 
AbstractCompressedUnaryTests {
        @Test
        public void testCompressionEstimationVSCompression() {
                try {
-                       if(!(cmb instanceof CompressedMatrixBlock))
+                       if(!(cmb instanceof CompressedMatrixBlock) ||
+                               compressionSettings.columnPartitioner == 
PartitionerType.COST_MATRIX_MULT)
                                return;
                        CompressionStatistics cStat = cmbStats;
                        if(cStat != null) {
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index 7b9d871..d6563d8 100644
--- 
a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -75,17 +75,18 @@ import org.junit.runners.Parameterized.Parameters;
 public abstract class CompressedTestBase extends TestBase {
        protected static final Log LOG = 
LogFactory.getLog(CompressedTestBase.class.getName());
 
-       protected static SparsityType[] usedSparsityTypes = new SparsityType[] 
{SparsityType.FULL, SparsityType.SPARSE,};
+       protected static SparsityType[] usedSparsityTypes = new SparsityType[] 
{SparsityType.FULL, SparsityType.SPARSE,
+               SparsityType.ULTRA_SPARSE};
 
        protected static ValueType[] usedValueTypes = new ValueType[] 
{ValueType.RAND_ROUND, ValueType.OLE_COMPRESSIBLE,
                ValueType.RLE_COMPRESSIBLE};
 
-       protected static ValueRange[] usedValueRanges = new ValueRange[] 
{ValueRange.SMALL, ValueRange.NEGATIVE,
-               ValueRange.BYTE};
+       protected static ValueRange[] usedValueRanges = new ValueRange[] 
{ValueRange.BOOLEAN, ValueRange.SMALL,
+               ValueRange.NEGATIVE};
 
        protected static OverLapping[] overLapping = new OverLapping[] {
                // OverLapping.COL,
-               OverLapping.PLUS, OverLapping.MATRIX, OverLapping.NONE, 
OverLapping.APPEND_EMPTY, OverLapping.APPEND_CONST,
+               OverLapping.PLUS, OverLapping.MATRIX, OverLapping.NONE, 
OverLapping.APPEND_CONST, OverLapping.APPEND_EMPTY
                // OverLapping.MATRIX_PLUS,
                // OverLapping.SQUASH,
                // OverLapping.MATRIX_MULT_NEGATIVE
@@ -115,10 +116,11 @@ public abstract class CompressedTestBase extends TestBase 
{
 
                new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("false")
                        .setInvestigateEstimate(true),
-               new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("true")
-                       
.setColumnPartitioner(PartitionerType.BIN_PACKING).setInvestigateEstimate(true),
-               new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("true")
-                       
.setColumnPartitioner(PartitionerType.STATIC).setInvestigateEstimate(true),
+
+               // new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("true")
+               // 
.setColumnPartitioner(PartitionerType.BIN_PACKING).setInvestigateEstimate(true),
+               // new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("true")
+               // 
.setColumnPartitioner(PartitionerType.STATIC).setInvestigateEstimate(true),
 
                new 
CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setTransposeInput("true")
                        
.setColumnPartitioner(PartitionerType.COST_MATRIX_MULT).setInvestigateEstimate(true),
@@ -155,7 +157,8 @@ public abstract class CompressedTestBase extends TestBase {
        };
 
        protected static MatrixTypology[] usedMatrixTypology = new 
MatrixTypology[] { // Selected Matrix Types
-               MatrixTypology.SMALL, MatrixTypology.FEW_COL,
+               // MatrixTypology.SMALL,
+               MatrixTypology.FEW_COL,
                // MatrixTypology.FEW_ROW,
                MatrixTypology.LARGE,
                // // MatrixTypology.SINGLE_COL,
@@ -263,7 +266,7 @@ public abstract class CompressedTestBase extends TestBase {
                                }
                                if(ov == OverLapping.PLUS) {
                                        // LOG.error(cmb.slice(0,10,0,10));
-                                       ScalarOperator sop = new 
LeftScalarOperator(Plus.getPlusFnObject(), 15);
+                                       ScalarOperator sop = new 
LeftScalarOperator(Plus.getPlusFnObject(), 5);
                                        mb = mb.scalarOperations(sop, new 
MatrixBlock());
                                        cmb = cmb.scalarOperations(sop, new 
MatrixBlock());
                                        // LOG.error(cmb.slice(0,10,0,10));
@@ -296,10 +299,16 @@ public abstract class CompressedTestBase extends TestBase 
{
                for(SparsityType st : usedSparsityTypes)
                        for(ValueType vt : usedValueTypes)
                                for(ValueRange vr : usedValueRanges)
-                                       for(CompressionSettingsBuilder cs : 
usedCompressionSettings)
-                                               for(MatrixTypology mt : 
usedMatrixTypology)
-                                                       for(OverLapping ov : 
overLapping)
-                                                               tests.add(new 
Object[] {st, vt, vr, cs, mt, ov});
+                                       if((st == SparsityType.ULTRA_SPARSE && 
vr == ValueRange.LARGE) || st != SparsityType.ULTRA_SPARSE)
+                                               for(CompressionSettingsBuilder 
cs : usedCompressionSettings)
+                                                       for(MatrixTypology mt : 
usedMatrixTypology)
+                                                               for(OverLapping 
ov : overLapping)
+                                                                       if((ov 
== OverLapping.APPEND_CONST || ov == OverLapping.APPEND_EMPTY)) {
+                                                                               
if(vr == ValueRange.BOOLEAN)
+                                                                               
        tests.add(new Object[] {st, vt, vr, cs, mt, ov});
+                                                                       }
+                                                                       else
+                                                                               
tests.add(new Object[] {st, vt, vr, cs, mt, ov});
                for(CompressionSettingsBuilder cs : usedCompressionSettings)
                        for(MatrixTypology mt : usedMatrixTypology)
                                for(OverLapping ov : overLapping) {
@@ -707,6 +716,8 @@ public abstract class CompressedTestBase extends TestBase {
                                // matrix-vector compressed
                                MatrixBlock ret2 = 
cmb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
 
+                               // LOG.error("actual : " + ret1);
+                               // LOG.error("compressed : " + ret2);
                                // compare result with input
                                compareResultMatrices(ret1, ret2, 100);
                        }
@@ -1063,7 +1074,6 @@ public abstract class CompressedTestBase extends TestBase 
{
                catch(Exception e) {
                        // e.printStackTrace();
                        throw new DMLRuntimeException(this.toString() + "\n" + 
e.getMessage(), e);
-
                }
        }
 
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/TestBase.java 
b/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
index 29a2a87..abdce93 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
@@ -52,7 +52,6 @@ public class TestBase {
        protected OverLapping overlappingType;
 
        // Input
-       protected double[][] input;
        protected MatrixBlock mb;
 
        public TestBase(SparsityType sparType, ValueType valType, ValueRange 
valueRange,
@@ -66,28 +65,29 @@ public class TestBase {
                        this.max = TestConstants.getMaxRangeValue(valueRange);
                        this.min = TestConstants.getMinRangeValue(valueRange);
                        this.overlappingType = ov;
+                       double[][] input;
                        switch(valType) {
                                case CONST:
                                        this.min = this.max;
                                        // Do not Break, utilize the RAND 
afterwards.
                                case RAND:
-                                       this.input = 
TestUtils.generateTestMatrix(rows, cols, min, max, sparsity, seed);
+                                       input = 
TestUtils.generateTestMatrix(rows, cols, min, max, sparsity, seed);
                                        break;
                                case RAND_ROUND:
-                                       this.input = 
TestUtils.round(TestUtils.generateTestMatrix(rows, cols, min, max, sparsity, 
seed));
+                                       input = 
TestUtils.round(TestUtils.generateTestMatrix(rows, cols, min, max, sparsity, 
seed));
                                        break;
                                case OLE_COMPRESSIBLE:
                                        // Note the Compressible Input 
generator, generates an already Transposed input
                                        // normally, therefore last argument is 
true, to build a non transposed matrix.
-                                       this.input = 
CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.OLE,
+                                       input = 
CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.OLE,
                                                (max - min), max, min, 
sparsity, seed, true);
                                        break;
                                case RLE_COMPRESSIBLE:
-                                       this.input = 
CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.RLE,
+                                       input = 
CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.RLE,
                                                (max - min), max, min, 
sparsity, seed, true);
                                        break;
                                case ONE_HOT_ENCODED:
-                                       this.input = 
CompressibleInputGenerator.getInputOneHotMatrix(rows, cols, seed);
+                                       input = 
CompressibleInputGenerator.getInputOneHotMatrix(rows, cols, seed);
                                        break;
                                default:
                                        throw new NotImplementedException("Not 
Implemented Test Value type input generator");
@@ -97,7 +97,7 @@ public class TestBase {
                        this.valType = valType;
                        this.compressionSettings = compressionSettings.create();
 
-                       mb = DataConverter.convertToMatrixBlock(this.input);
+                       mb = DataConverter.convertToMatrixBlock(input);
 
                }
                catch(Exception e) {

Reply via email to