[SYSTEMML-810] New compressed matrix blocks and operations, tests

For details, see https://issues.apache.org/jira/browse/SYSTEMML-449.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16e7b1c8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16e7b1c8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16e7b1c8

Branch: refs/heads/master
Commit: 16e7b1c88e45e007d1db229717311fcf70bc6b19
Parents: 71013e7
Author: Matthias Boehm <[email protected]>
Authored: Sat Jul 16 00:43:16 2016 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Sat Jul 16 17:22:50 2016 -0700

----------------------------------------------------------------------
 .../runtime/compress/BitmapDecoderOLE.java      |  127 ++
 .../runtime/compress/BitmapDecoderRLE.java      |  117 ++
 .../sysml/runtime/compress/BitmapEncoder.java   |  392 +++++
 .../apache/sysml/runtime/compress/ColGroup.java |  259 ++++
 .../sysml/runtime/compress/ColGroupBitmap.java  |  498 +++++++
 .../sysml/runtime/compress/ColGroupOLE.java     |  634 +++++++++
 .../sysml/runtime/compress/ColGroupRLE.java     |  617 ++++++++
 .../runtime/compress/ColGroupUncompressed.java  |  360 +++++
 .../runtime/compress/CompressedMatrixBlock.java | 1342 ++++++++++++++++++
 .../runtime/compress/PlanningBinPacker.java     |  191 +++
 .../sysml/runtime/compress/PlanningCoCoder.java |  227 +++
 .../runtime/compress/PlanningCoCodingGroup.java |  104 ++
 .../compress/PlanningGroupMergeAction.java      |   73 +
 .../runtime/compress/ReaderColumnSelection.java |   64 +
 .../compress/ReaderColumnSelectionDense.java    |   68 +
 .../ReaderColumnSelectionDenseSample.java       |   86 ++
 .../compress/ReaderColumnSelectionSparse.java   |  115 ++
 .../runtime/compress/UncompressedBitmap.java    |  101 ++
 .../compress/estim/CompressedSizeEstimator.java |  145 ++
 .../estim/CompressedSizeEstimatorExact.java     |   53 +
 .../estim/CompressedSizeEstimatorSample.java    |  767 ++++++++++
 .../compress/estim/CompressedSizeInfo.java      |   69 +
 .../compress/estim/SizeEstimatorFactory.java    |   40 +
 .../runtime/compress/utils/ConverterUtils.java  |   99 ++
 .../sysml/runtime/compress/utils/DblArray.java  |   91 ++
 .../compress/utils/DblArrayIntListHashMap.java  |  179 +++
 .../compress/utils/DoubleIntListHashMap.java    |  181 +++
 .../runtime/compress/utils/IntArrayList.java    |  102 ++
 .../compress/utils/LinearAlgebraUtils.java      |  383 +++++
 .../runtime/functionobjects/KahanFunction.java  |    8 +
 .../runtime/functionobjects/KahanPlus.java      |    5 +
 .../runtime/functionobjects/KahanPlusSq.java    |   15 +-
 .../runtime/matrix/data/LibMatrixMult.java      |   27 +-
 .../sysml/runtime/matrix/data/MatrixBlock.java  |    2 +-
 .../compress/BasicCompressionTest.java          |  168 +++
 .../compress/BasicMatrixAppendTest.java         |  176 +++
 .../compress/BasicMatrixMultChainTest.java      |  245 ++++
 .../BasicMatrixTransposeSelfMultTest.java       |  172 +++
 .../compress/BasicMatrixVectorMultTest.java     |  180 +++
 .../BasicScalarOperationsSparseUnsafeTest.java  |  177 +++
 .../compress/BasicScalarOperationsTest.java     |  177 +++
 .../BasicTransposeSelfLeftMatrixMultTest.java   |  172 +++
 .../compress/BasicUnaryAggregateTest.java       |  544 +++++++
 .../compress/BasicVectorMatrixMultTest.java     |  180 +++
 .../functions/compress/CompressedLinregCG.java  |  151 ++
 .../compress/CompressedSerializationTest.java   |  185 +++
 .../compress/LargeCompressionTest.java          |  169 +++
 .../compress/LargeMatrixVectorMultTest.java     |  180 +++
 .../compress/LargeParMatrixVectorMultTest.java  |  182 +++
 .../compress/LargeVectorMatrixMultTest.java     |  180 +++
 .../compress/ParMatrixMultChainTest.java        |  247 ++++
 .../compress/ParMatrixVectorMultTest.java       |  182 +++
 .../ParTransposeSelfLeftMatrixMultTest.java     |  174 +++
 .../compress/ParUnaryAggregateTest.java         |  547 +++++++
 .../compress/ParVectorMatrixMultTest.java       |  181 +++
 src/test/scripts/functions/compress/LinregCG.R  |   57 +
 .../scripts/functions/compress/LinregCG.dml     |   56 +
 .../compress/SystemML-config-compress.xml       |   59 +
 .../functions/compress/ZPackageSuite.java       |   56 +
 59 files changed, 12322 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java 
b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
new file mode 100644
index 0000000..fc6e861
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderOLE.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+/**
+ * General-purpose iterator to decode a compressed OLE bitmap.
+ *  
+ */
+public final class BitmapDecoderOLE implements Iterator<Integer> 
+{
+       // pointer to the compressed bitmap
+       private int _bmOff;
+       private int _bmLen;
+       private char[] _bmPtr;
+
+       // The index of the current block. Block 0 covers bits 1 through 2^16
+       private int _blockIx;
+
+       // The offset where the current block starts within the bitmap.
+       private int _blockStartOffset;
+
+       // The number of offsets in the current block.
+       private int _curBlockSize;
+
+       // The offset <b>in the current block</b> the <b>next</b> element we 
will
+       // read from the bitmap, or bmPtr.length if we are done.
+       private int _nextBmOffset;
+
+       /**
+        * Point this object at the beginning of a particular bitmap. After a 
call
+        * to this method, the next call to {@link #nextOffset()} will return 
the
+        * offset of the first bit in the specified bitmap.
+        * 
+        * @param bmPtr
+        *            pointer to a compressed bitmap
+        */
+       public BitmapDecoderOLE(char[] bmPtr, int off, int len) {
+               _bmOff = off;
+               _bmLen = len;
+               _bmPtr = bmPtr;
+               _blockIx = 0;
+               _blockStartOffset = 0;
+               _curBlockSize = _bmPtr[_bmOff+_blockStartOffset];
+               if (_curBlockSize < 0) {
+                       throw new RuntimeException(String.format(
+                                       "Negative block size %d at position %d 
of %s",
+                                       _curBlockSize, _blockStartOffset, 
Arrays.toString(bmPtr)));
+               }
+               _nextBmOffset = 0;
+
+               // Advance past any zero-length blocks at the beginning of the 
array
+               while (_blockStartOffset < _bmLen
+                               && _nextBmOffset >= _curBlockSize) {
+                       advanceToNextBlock();
+               }
+       }
+
+       @Override
+       public Integer next() {
+               if( !hasNext() )
+                       throw new RuntimeException("No next offset existing.");
+               
+               // Grab the lookahead value Note the +1 in the array indexing; 
+               // the first number in a block is the block size
+               int offsetFromBlockBegin = _bmPtr[_bmOff+_blockStartOffset + 1 
+ _nextBmOffset];
+               int ret = (_blockIx * BitmapEncoder.BITMAP_BLOCK_SZ)
+                               + offsetFromBlockBegin;
+               _nextBmOffset++;
+
+               // Advance to next non-empty block if we reached the end of the 
block.
+               while (_blockStartOffset < _bmLen && _nextBmOffset >= 
_curBlockSize) {
+                       advanceToNextBlock();
+               }
+
+               return ret;
+       }
+
+       @Override
+       public boolean hasNext() {
+               return _blockStartOffset < _bmLen;
+       }
+
+       @Override
+       public void remove() {
+               throw new RuntimeException("Not implemented for 
BitmapDecoderOLE.");
+       }
+
+       /**
+        * Move forward to the next block. Does not skip empty blocks.
+        */
+       private void advanceToNextBlock() {
+               _blockStartOffset += (1 + _curBlockSize);
+               _blockIx++;
+               if (_blockStartOffset >= _bmLen) {
+                       // Read past last block
+                       return;
+               }
+
+               _curBlockSize = _bmPtr[_bmOff+_blockStartOffset];
+               if (_curBlockSize < 0) {
+                       throw new RuntimeException(String.format(
+                                       "Negative block size %d at position %d 
of %s",
+                                       _curBlockSize, _blockStartOffset, 
Arrays.toString(_bmPtr)));
+               }
+               _nextBmOffset = 0;
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java 
b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
new file mode 100644
index 0000000..54f24ae
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapDecoderRLE.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Iterator;
+
+/**
+ * General-purpose iterator to decode a compressed OLE bitmap.
+ * 
+ */
+public final class BitmapDecoderRLE implements Iterator<Integer>
+{
+       // pointer to the compressed bitmap
+       private int _bmOff;
+       private int _bmLen;
+       private char[] _bmPtr;
+
+       // The offset of the <b>next</b> element we will read from the bitmap, 
or
+       // bmPtr.length if we are done.
+       private int _nextBmOffset;
+
+       // The offset in the matrix column of the beginning of the current run
+       private int _runStartOffset;
+
+       // The length of the current run
+       private int _curRunLen;
+
+       // The number of bits that we have returned from the current run.
+       private int _runBitsReturned;
+
+       /**
+        * Point this object at the beginning of a particular bitmap. After a 
call
+        * to this method, the next call to {@link #nextOffset()} will return 
the
+        * offset of the first bit in the specified bitmap.
+        * 
+        * @param bmPtr
+        *            pointer to a compressed bitmap
+        */
+       public BitmapDecoderRLE(char[] bmPtr, int off, int len) {
+               _bmOff = off;
+               _bmLen = len;
+               _bmPtr = bmPtr;
+               _nextBmOffset = 0;
+               _runStartOffset = 0;
+               _curRunLen = 0;
+               _runBitsReturned = 0;
+
+               if (0 == _bmLen) {
+                       return; //no runs
+               }
+
+               // Advance to the beginning of the first non-empty run.
+               advanceToNextRun();
+       }
+
+       @Override
+       public Integer next() {
+               if( !hasNext() )
+                       throw new RuntimeException("No next offset existing.");
+               
+               // Grab the lookahead value
+               int ret = _runStartOffset + _runBitsReturned;
+
+               _runBitsReturned++;
+
+               // Check for end of run
+               if (_runBitsReturned == _curRunLen) {
+                       advanceToNextRun();
+               }
+
+               return ret;
+       }
+
+       @Override
+       public boolean hasNext() {
+               return _runBitsReturned < _curRunLen;
+       }
+       
+       @Override
+       public void remove() {
+               throw new RuntimeException("Not implemented for 
BitmapDecoderRLE.");
+       }
+       
+       /** Move forward to the next non-empty run. */
+       private void advanceToNextRun() {
+               // While loop needed because some runs are of length 0
+               while (_runBitsReturned == _curRunLen && _nextBmOffset < 
_bmLen) {
+
+                       _runBitsReturned = 0;
+
+                       // Read the distance to the next run
+                       char delta = _bmPtr[_bmOff + _nextBmOffset];
+
+                       // Run length is stored in the next element of the array
+                       _runStartOffset += delta + _curRunLen;
+                       _curRunLen = _bmPtr[_bmOff + _nextBmOffset + 1];
+                       _nextBmOffset += 2;
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java 
b/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
new file mode 100644
index 0000000..8a535e1
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/BitmapEncoder.java
@@ -0,0 +1,392 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.compress.utils.DblArrayIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.DoubleIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.IntArrayList;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+
+
+/** 
+ * Static functions for encoding bitmaps in various ways. 
+ * 
+ */
+public class BitmapEncoder 
+{
+       /** Size of the blocks used in a blocked bitmap representation. */
+       public static final int BITMAP_BLOCK_SZ = 65536;
+       
+       /**
+        * Generate uncompressed bitmaps for a set of columns in an uncompressed
+        * matrix block.
+        * 
+        * @param colIndices
+        *            indexes (within the block) of the columns to extract
+        * @param rawblock
+        *            an uncompressed matrix block; can be dense or sparse
+        * @return uncompressed bitmap representation of the columns
+        * @throws DMLRuntimeException 
+        */
+       public static UncompressedBitmap extractBitmap(int[] colIndices, 
MatrixBlock rawblock) 
+       {
+               //note: no sparse column selection reader because low potential
+               //single column selection
+               if( colIndices.length==1 ) {
+                       return extractBitmap(colIndices[0], rawblock, 
+                                       
!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+               }
+               //multiple column selection     (general case)
+               else { 
+                       ReaderColumnSelection reader = null;
+                       if( rawblock.isInSparseFormat() && 
CompressedMatrixBlock.TRANSPOSE_INPUT )
+                               reader = new 
ReaderColumnSelectionSparse(rawblock, colIndices, 
+                                               
!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+                       else
+                               reader = new 
ReaderColumnSelectionDense(rawblock, colIndices,
+                                               
!CompressedMatrixBlock.MATERIALIZE_ZEROS); 
+                       
+                       return extractBitmap(colIndices, rawblock, reader);
+               }
+       }
+
+       /**
+        * 
+        * @param colIndices
+        * @param rawblock
+        * @param sampleIndexes
+        * @return
+        */
+       public static UncompressedBitmap extractBitmapFromSample(int[] 
colIndices,
+                       MatrixBlock rawblock, int[] sampleIndexes) 
+       {
+               //note: no sparse column selection reader because low potential
+               
+               //single column selection
+               if( colIndices.length==1 ) {
+                       return extractBitmap(colIndices[0], rawblock, 
sampleIndexes,
+                                       
!CompressedMatrixBlock.MATERIALIZE_ZEROS);
+               }
+               //multiple column selection     (general case)
+               else {                  
+                       return extractBitmap(colIndices, rawblock,
+                                       new 
ReaderColumnSelectionDenseSample(rawblock, colIndices,
+                                                       sampleIndexes, 
!CompressedMatrixBlock.MATERIALIZE_ZEROS));      
+               }
+       }
+
+       /**
+        * Encodes the bitmap as a series of run lengths and offsets.
+        * <p>
+        * <b>NOTE: This method must be kept in sync with {@link 
BitmapDecoderRLE}
+        * !</b>
+        * 
+        * @param offsets
+        *            uncompressed contents of the bitmap, expressed as a list 
of
+        *            the offsets of different bits
+        * @return compressed version of said bitmap
+        */
+       public static char[] genRLEBitmap(int[] offsets) {
+               if( offsets.length == 0 )
+                       return new char[0]; //empty list
+
+               // Use an ArrayList for correctness at the expense of temp space
+               ArrayList<Character> buf = new ArrayList<Character>();
+
+               // 1 + (position of last 1 in the previous run of 1's)
+               // We add 1 because runs may be of length zero.
+               int lastRunEnd = 0;
+
+               // Offset between the end of the previous run of 1's and the 
first 1 in
+               // the current run. Initialized below.
+               int curRunOff;
+
+               // Length of the most recent run of 1's
+               int curRunLen = 0;
+
+               // Current encoding is as follows:
+               // Negative entry: abs(Entry) encodes the offset to the next 
lone 1 bit.
+               // Positive entry: Entry encodes offset to next run of 1's. The 
next
+               // entry in the bitmap holds a run length.
+
+               // Special-case the first run to simplify the loop below.
+               int firstOff = offsets[0];
+
+               // The first run may start more than a short's worth of bits in
+               while (firstOff > Character.MAX_VALUE) {
+                       buf.add(Character.MAX_VALUE);
+                       buf.add((char) 0);
+                       firstOff -= Character.MAX_VALUE;
+                       lastRunEnd += Character.MAX_VALUE;
+               }
+
+               // Create the first run with an initial size of 1
+               curRunOff = firstOff;
+               curRunLen = 1;
+
+               // Process the remaining offsets
+               for (int i = 1; i < offsets.length; i++) {
+
+                       int absOffset = offsets[i];
+
+                       // 1 + (last position in run)
+                       int curRunEnd = lastRunEnd + curRunOff + curRunLen;
+
+                       if (absOffset > curRunEnd || curRunLen >= 
Character.MAX_VALUE) {
+                               // End of a run, either because we hit a run of 
0's or because the 
+                               // number of 1's won't fit in 16 bits. Add run 
to bitmap and start a new one.
+                               buf.add((char) curRunOff);
+                               buf.add((char) curRunLen);
+
+                               lastRunEnd = curRunEnd;
+                               curRunOff = absOffset - lastRunEnd;
+
+                               while (curRunOff > Character.MAX_VALUE) {
+                                       // SPECIAL CASE: Offset to next run 
doesn't fit into 16 bits.
+                                       // Add zero-length runs until the 
offset is small enough.
+                                       buf.add(Character.MAX_VALUE);
+                                       buf.add((char) 0);
+                                       lastRunEnd += Character.MAX_VALUE;
+                                       curRunOff -= Character.MAX_VALUE;
+                               }
+                               
+                               curRunLen = 1;
+                       } else {
+                               // Middle of a run
+                               curRunLen++;
+                       }
+               }
+
+               // Close out the last run
+               if (curRunLen >= 1) {
+                       buf.add((char) curRunOff);
+                       buf.add((char) curRunLen);
+               }
+
+               // Convert wasteful ArrayList to packed array.
+               char[] ret = new char[buf.size()];
+               for (int i = 0; i < buf.size(); i++) {
+                       ret[i] = buf.get(i);
+               }
+               return ret;
+       }
+
+       /**
+        * Encodes the bitmap in blocks of offsets. Within each block, the bits 
are
+        * stored as absolute offsets from the start of the block.
+        * 
+        * @param offsets
+        *            uncompressed contents of the bitmap, expressed as a list 
of
+        *            the offsets of different bits
+        * @return compressed version of said bitmap
+        */
+       public static char[] genOffsetBitmap(int[] offsets) 
+       {
+               int lastOffset = offsets[offsets.length - 1];
+
+               // Build up the blocks
+               int numBlocks = (lastOffset / BITMAP_BLOCK_SZ) + 1;
+               // To simplify the logic, we make two passes.
+               // The first pass divides the offsets by block.
+               int[] blockLengths = new int[numBlocks];
+               Arrays.fill(blockLengths, 0);
+
+               for (int ix = 0; ix < offsets.length; ix++) {
+                       int val = offsets[ix];
+                       int blockForVal = val / BITMAP_BLOCK_SZ;
+
+                       blockLengths[blockForVal]++;
+               }
+
+               // The second pass creates the blocks.
+               int totalSize = numBlocks;
+               for (int block = 0; block < numBlocks; block++) {
+                       totalSize += blockLengths[block];
+               }
+               char[] encodedBlocks = new char[totalSize];
+
+               int inputIx = 0;
+               int blockStartIx = 0;
+               for (int block = 0; block < numBlocks; block++) {
+                       int blockSz = blockLengths[block];
+
+                       // First entry in the block is number of bits
+                       encodedBlocks[blockStartIx] = (char) blockSz;
+
+                       for (int i = 0; i < blockSz; i++) {
+                               encodedBlocks[blockStartIx + i + 1] = (char) 
+                                               (offsets[inputIx+i] % 
BITMAP_BLOCK_SZ);
+                       }
+
+                       inputIx += blockSz;
+                       blockStartIx += blockSz + 1;
+               }
+
+               return encodedBlocks;
+       }
+
+       /**
+        * 
+        * @param colIndex
+        * @param rawblock
+        * @return
+        * @throws DMLRuntimeException 
+        */
+       private static UncompressedBitmap extractBitmap(int colIndex, 
MatrixBlock rawblock, boolean skipZeros) 
+       {
+               //probe map for distinct items (for value or value groups)
+               DoubleIntListHashMap distinctVals = new DoubleIntListHashMap();
+               
+               //scan rows and probe/build distinct items
+               final int m = CompressedMatrixBlock.TRANSPOSE_INPUT ?
+                               rawblock.getNumColumns():rawblock.getNumRows();
+               
+               if( rawblock.isInSparseFormat() //SPARSE 
+                       && CompressedMatrixBlock.TRANSPOSE_INPUT )      
+               {
+                       SparseBlock a = rawblock.getSparseBlock();
+                       if( a != null && !a.isEmpty(colIndex) ) 
+                       {
+                               int apos = a.pos(colIndex);
+                               int alen = a.size(colIndex);
+                               int[] aix = a.indexes(colIndex);
+                               double[] avals = a.values(colIndex);
+                               
+                               IntArrayList lstPtr0 = new IntArrayList(); 
//for 0 values
+                               int last = -1;
+                               //iterate over non-zero entries but fill in 
zeros
+                               for( int j=apos; j<apos+alen; j++ ) 
+                               {
+                                       //fill in zero values
+                                       if( !skipZeros )
+                                               for( int k=last+1; k<aix[j]; 
k++ ) 
+                                                       lstPtr0.appendValue(k);
+                                       //handle non-zero value
+                                       IntArrayList lstPtr = 
distinctVals.get(avals[j]);       
+                                       if( lstPtr == null ) {
+                                               lstPtr = new IntArrayList();
+                                               
distinctVals.appendValue(avals[j], lstPtr);
+                                       }
+                                       lstPtr.appendValue(aix[j]);
+                                       last = aix[j];
+                               }
+                               //fill in remaining zero values
+                               if( !skipZeros ) {
+                                       for( int k=last+1; k<m; k++ )
+                                               lstPtr0.appendValue(k);
+                                       if( lstPtr0.size()>0 )
+                                               distinctVals.appendValue(0, 
lstPtr0);
+                               }
+                       }
+                       else if( !skipZeros ) { //full 0 column 
+                               IntArrayList lstPtr = new IntArrayList();
+                               for( int i=0; i<m; i++ )
+                                       lstPtr.appendValue(i);
+                               distinctVals.appendValue(0, lstPtr);
+                       }
+               }
+               else //GENERAL CASE
+               {
+                       for( int i=0; i<m; i++ ) {
+                               double val = 
CompressedMatrixBlock.TRANSPOSE_INPUT ? 
+                                               
rawblock.quickGetValue(colIndex, i):
+                                               rawblock.quickGetValue(i, 
colIndex);
+                               if( val!=0 || !skipZeros ) {            
+                                       IntArrayList lstPtr = 
distinctVals.get(val);    
+                                       if( lstPtr == null ) {
+                                               lstPtr = new IntArrayList();
+                                               distinctVals.appendValue(val, 
lstPtr);
+                                       }
+                                       lstPtr.appendValue(i);
+                               }
+                       }
+               }
+               
+               return new UncompressedBitmap(distinctVals);
+       }
+       
+       /**
+        * 
+        * @param colIndex
+        * @param rawblock
+        * @param sampleIndexes
+        * @return
+        */
+       private static UncompressedBitmap extractBitmap(int colIndex, 
MatrixBlock rawblock, int[] sampleIndexes, boolean skipZeros) 
+       {
+               //note: general case only because anyway binary search for 
small samples
+               
+               //probe map for distinct items (for value or value groups)
+               DoubleIntListHashMap distinctVals = new DoubleIntListHashMap();
+               
+               //scan rows and probe/build distinct items
+               final int m = sampleIndexes.length;
+               for( int i=0; i<m; i++ ) {
+                       int rowIndex = sampleIndexes[i]; 
+                       double val = CompressedMatrixBlock.TRANSPOSE_INPUT ? 
+                                       rawblock.quickGetValue(colIndex, 
rowIndex) : 
+                                       rawblock.quickGetValue(rowIndex, 
colIndex); 
+                       if( val!=0 || !skipZeros ) {                            
        
+                               IntArrayList lstPtr = distinctVals.get(val);    
+                               if( lstPtr == null ) {
+                                       lstPtr = new IntArrayList();
+                                       distinctVals.appendValue(val, lstPtr);
+                               }
+                               lstPtr.appendValue(i);
+                       }
+               }
+
+               return new UncompressedBitmap(distinctVals);
+       }
+       
+       /**
+        * 
+        * @param colIndices
+        * @param rawblock
+        * @param rowReader
+        * @return
+        */
+       private static UncompressedBitmap extractBitmap(int[] colIndices,
+                       MatrixBlock rawblock, ReaderColumnSelection rowReader) 
+       {
+               //probe map for distinct items (for value or value groups)
+               DblArrayIntListHashMap distinctVals = new 
DblArrayIntListHashMap();
+               
+               //scan rows and probe/build distinct items
+               DblArray cellVals = null;
+               while ((cellVals = rowReader.nextRow()) != null) {
+                       IntArrayList lstPtr = distinctVals.get(cellVals);
+                       if (lstPtr == null) {
+                               //create new objects only on demand
+                               lstPtr = new IntArrayList();
+                               distinctVals.appendValue(new 
DblArray(cellVals), lstPtr);
+                       }
+                       lstPtr.appendValue(rowReader.getCurrentRowIndex());
+               }
+               
+               return new UncompressedBitmap(distinctVals, colIndices.length);
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
new file mode 100644
index 0000000..647a701
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroup.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * Class that stores information about a column group within a compressed 
matrix
+ * block. There are subclasses specific to each compression type.
+ * 
+ */
+public abstract class ColGroup implements Serializable 
+{
+       private static final long serialVersionUID = 2439785418908671481L;
+
+       public enum CompressionType  {
+               UNCOMPRESSED,   //uncompressed sparse/dense 
+               RLE_BITMAP,     //RLE bitmap
+               OLE_BITMAP;  //OLE bitmap
+       }
+       
+       /**
+        * Offsets of the columns that make up the column group. Zero-based, and
+        * relative to the matrix block.
+        */
+       protected int[] _colIndexes;
+
+       /** Number of rows in the matrix, for use by child classes. */
+       protected int _numRows;
+
+       /** How the elements of the column group are compressed. */
+       private CompressionType _compType;
+
+       
+       /**
+        * Main constructor.
+        * 
+        * @param colIndices
+        *            offsets of the columns in the matrix block that make up 
the
+        *            group
+        * @param numRows
+        *            total number of rows in the parent block
+        */
+       protected ColGroup(CompressionType type, int[] colIndices, int numRows) 
{
+               _compType = type;
+               _colIndexes = colIndices;
+               _numRows = numRows;
+       }
+
+       /** Convenience constructor for converting indices to a more compact 
format. */
+       protected ColGroup(CompressionType type, List<Integer> colIndicesList, 
int numRows) {
+               _compType = type;
+               _colIndexes = new int[colIndicesList.size()];
+               int i = 0;
+               for (Integer index : colIndicesList)
+                       _colIndexes[i++] = index;
+       }
+
+       /**
+        * @return offsets of the columns in the matrix block that make up the 
group
+        */
+       public int[] getColIndices() {
+               return _colIndexes;
+       }
+
+       /**
+        * @param col
+        *            an index from 0 to the number of columns in this group - 1
+        * @return offset of the specified column in the matrix block that make 
up
+        *         the group
+        */
+       public int getColIndex(int colNum) {
+               return _colIndexes[colNum];
+       }
+
+       public int getNumRows() {
+               return _numRows;
+       }
+       
+       /**
+        * @return number of columns in this column group
+        */
+       public int getNumCols() {
+               return _colIndexes.length;
+       }
+
+       /**
+        * @return How the elements of the column group are compressed.
+        */
+       public CompressionType getCompType() {
+               return _compType;
+       }
+
+       /**
+        * 
+        * @param offset
+        */
+       public void shiftColIndices(int offset)  {
+               for( int i=0; i<_colIndexes.length; i++ )
+                       _colIndexes[i] += offset;
+       }
+       
+       /**
+        * Note: Must be overridden by child classes to account for additional 
data
+        * and metadata
+        * 
+        * @return an upper bound on the number of bytes used to store this 
ColGroup
+        *         in memory.
+        */
+       public long estimateInMemorySize() {
+               // int numRows (4B) , array reference colIndices (8B) + array 
object
+               // overhead if exists (32B) + 4B per element, CompressionType 
compType
+               // (2 booleans 2B + enum overhead 32B + reference to enum 8B)
+               long size = 54;
+               if (_colIndexes == null)
+                       return size;
+               else
+                       return size + 32 + 4 * _colIndexes.length;
+       }
+
+       /**
+        * Decompress the contents of this column group into the specified full
+        * matrix block.
+        * 
+        * @param target
+        *            a matrix block where the columns covered by this column 
group
+        *            have not yet been filled in.
+        */
+       public abstract void decompressToBlock(MatrixBlock target);
+
+       /**
+        * Decompress the contents of this column group into uncompressed packed
+        * columns
+        * 
+        * @param target
+        *            a dense matrix block. The block must have enough space to 
hold
+        *            the contents of this column group.
+        * @param colIndexTargets
+        *            array that maps column indices in the original matrix 
block to
+        *            columns of target.
+        */
+       public abstract void decompressToBlock(MatrixBlock target, int[] 
colIndexTargets);
+
+       /**
+        * 
+        * @param target  dense output vector
+        * @param colpos  column to decompress, error if larger or equal numCols
+        */
+       public abstract void decompressToBlock(MatrixBlock target, int colpos);
+
+
+       /**
+        * Serializes column group to data output.
+        * 
+        * @param out
+        * @throws IOException
+        */
+       public abstract void write(DataOutput out) 
+               throws IOException;
+       
+       /**
+        * Deserializes column group from data input.
+        * 
+        * @param in
+        * @throws IOException
+        */
+       public abstract void readFields(DataInput in) 
+               throws IOException;
+               
+       
+       /**
+        * Returns the exact serialized size of column group.
+        * This can be used for example for buffer preallocation.
+        * 
+        * @return
+        */
+       public abstract long getExactSizeOnDisk();
+       
+       /**
+        * Multiply the slice of the matrix that this column group represents 
by a
+        * vector on the right.
+        * 
+        * @param vector
+        *            vector to multiply by (tall vector)
+        * @param result
+        *            accumulator for holding the result
+        * @throws DMLRuntimeException
+        *             if the internal SystemML code that performs the
+        *             multiplication experiences an error
+        */
+       public abstract void rightMultByVector(MatrixBlock vector,
+                       MatrixBlock result, int rl, int ru) throws 
DMLRuntimeException;
+
+
+       /**
+        * Multiply the slice of the matrix that this column group represents 
by a
+        * row vector on the left (the original column vector is assumed to be
+        * transposed already i.e. its size now is 1xn).
+        * 
+        * @param vector
+        * @param result
+        * @throws DMLRuntimeException 
+        */
+       public abstract void leftMultByRowVector(MatrixBlock vector,
+                       MatrixBlock result) throws DMLRuntimeException;
+
+       /**
+        * Perform the specified scalar operation directly on the compressed 
column
+        * group, without decompressing individual cells if possible.
+        * 
+        * @param op
+        *            operation to perform
+        * @return version of this column group with the operation applied
+        */
+       public abstract ColGroup scalarOperation(ScalarOperator op)
+                       throws DMLRuntimeException;
+
+       /**
+        * 
+        * @param op
+        * @param result
+        * @throws DMLUnsupportedOperationException
+        * @throws DMLRuntimeException
+        */
+       public abstract void unaryAggregateOperations(AggregateUnaryOperator 
op, MatrixBlock result)
+               throws DMLRuntimeException;
+       
+       /**
+        * 
+        * @param rnnz
+        */
+       protected abstract void countNonZerosPerRow(int[] rnnz);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
new file mode 100644
index 0000000..5d60a49
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupBitmap.java
@@ -0,0 +1,498 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.Map.Entry;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+
+/**
+ * Base class for column groups encoded with various types of bitmap encoding.
+ * 
+ * 
+ * NOTES:
+ *  * OLE: separate storage segment length and bitmaps led to a 30% improvement
+ *    but not applied because more difficult to support both data layouts at 
the
+ *    same time (distributed/local as well as w/ and w/o low-level opt)
+ */
+public abstract class ColGroupBitmap extends ColGroup 
+{
+       private static final long serialVersionUID = -1635828933479403125L;
+       
+       public static final boolean LOW_LEVEL_OPT = true;       
+       //sorting of values by physical length helps by 10-20%, especially for 
serial, while
+       //slight performance decrease for parallel incl multi-threaded, hence 
not applied for
+       //distributed operations (also because compression time + garbage 
collection increases)
+       private static final boolean SORT_VALUES_BY_LENGTH = true; 
+       protected static final boolean CREATE_SKIPLIST = true;
+       
+       protected static final int READ_CACHE_BLKSZ = 2 * 
BitmapEncoder.BITMAP_BLOCK_SZ;
+       protected static final int WRITE_CACHE_BLKSZ = 2 * 
BitmapEncoder.BITMAP_BLOCK_SZ;
+       
+       /** Distinct values associated with individual bitmaps. */
+       protected double[] _values; //linearized <numcol vals> <numcol vals>
+
+       /** Bitmaps, one per uncompressed value in {@link #values}. */
+       protected int[] _ptr; //bitmap offsets per value
+       protected char[] _data; //linearized bitmaps (variable length)
+       
+       protected int[] _skiplist;
+       
+       public ColGroupBitmap(CompressionType type) {
+               super(type, (int[]) null, -1);
+       }
+       
+       /**
+        * Main constructor. Stores the headers for the individual bitmaps.
+        * 
+        * @param colIndices
+        *            indices (within the block) of the columns included in this
+        *            column
+        * @param numRows
+        *            total number of rows in the parent block
+        * @param ubm
+        *            Uncompressed bitmap representation of the block
+        */
+       public ColGroupBitmap(CompressionType type, int[] colIndices, int 
numRows, UncompressedBitmap ubm) 
+       {
+               super(type, colIndices, numRows);
+
+               // Extract and store just the distinct values. The bitmaps 
themselves go
+               // into the subclasses.
+               final int numCols = ubm.getNumColumns();
+               final int numVals = ubm.getNumValues();
+               
+               _values = new double[numVals*numCols];
+               for (int i=0; i<numVals; i++) {
+                       //note: deep copied internally on getValues
+                       double[] tmp = ubm.getValues(i);
+                       System.arraycopy(tmp, 0, _values, i*numCols, numCols);
+               }
+       }
+
+       /**
+        * Constructor for subclass methods that need to create shallow copies
+        * 
+        * @param colIndices
+        *            raw column index information
+        * @param numRows
+        *            number of rows in the block
+        * @param values
+        *            set of distinct values for the block (associated bitmaps 
are
+        *            kept in the subclass)
+        */
+       protected ColGroupBitmap(CompressionType type, int[] colIndices, int 
numRows, double[] values) {
+               super(type, colIndices, numRows);
+               _values = values;
+       }
+       
+       protected final int len(int k) {
+               return _ptr[k+1] - _ptr[k];
+       }
+
+       /**
+        * 
+        * @param numVals
+        * @param totalLen
+        * @param lbitmaps
+        */
+       protected void createCompressedBitmaps(int numVals, int totalLen, 
char[][] lbitmaps)
+       {
+               // compact bitmaps to linearized representation
+               if( LOW_LEVEL_OPT && SORT_VALUES_BY_LENGTH
+                       && _numRows > BitmapEncoder.BITMAP_BLOCK_SZ ) 
+               {
+                       // sort value by num segments in descending order
+                       TreeMap<Integer,ArrayList<Integer>> tree = new 
TreeMap<Integer, ArrayList<Integer>>();
+                       for( int i=0; i<numVals; i++ ) {
+                               int revlen = totalLen-lbitmaps[i].length;
+                               if( !tree.containsKey(revlen) )
+                                       tree.put(revlen, new 
ArrayList<Integer>());
+                               tree.get(revlen).add(i);
+                       }
+                       
+                       // compact bitmaps to linearized representation
+                       _ptr = new int[numVals+1];
+                       _data = new char[totalLen];
+                       int pos = 0, off = 0;
+                       for( Entry<Integer,ArrayList<Integer>> e : 
tree.entrySet() ) {
+                               for( Integer tmpix : e.getValue() ) {
+                                       int len = lbitmaps[tmpix].length;
+                                       _ptr[pos] = off;
+                                       System.arraycopy(lbitmaps[tmpix], 0, 
_data, off, len);
+                                       off += len;
+                                       pos++;
+                               }
+                       }
+                       _ptr[numVals] = totalLen;
+                       
+                       // reorder values
+                       double[] lvalues = new double[_values.length];
+                       int off2 = 0; int numCols = _colIndexes.length;
+                       for( Entry<Integer,ArrayList<Integer>> e : 
tree.entrySet() ) {
+                               for( Integer tmpix : e.getValue() ) {
+                                       System.arraycopy(_values, 
tmpix*numCols, lvalues, off2, numCols);                               
+                                       off2 += numCols;
+                               }
+                       }                       
+                       _values = lvalues;
+               }
+               else
+               {
+                       // compact bitmaps to linearized representation
+                       _ptr = new int[numVals+1];
+                       _data = new char[totalLen];
+                       for( int i=0, off=0; i<numVals; i++ ) {
+                               int len = lbitmaps[i].length;
+                               _ptr[i] = off;
+                               System.arraycopy(lbitmaps[i], 0, _data, off, 
len);
+                               off += len;
+                       }
+                       _ptr[numVals] = totalLen;
+               }
+       }
+       
+       @Override
+       public long estimateInMemorySize() {
+               long size = super.estimateInMemorySize();
+               
+               // adding the size of values
+               size += 8; //array reference
+               if (_values != null) {
+                       size += 32 + _values.length * 8; //values
+               }
+               
+               // adding bitmaps size
+               size += 16; //array references
+               if (_data != null) {
+                       size += 32 + _ptr.length * 4; // offsets
+                       size += 32 + _data.length * 2;    // bitmaps
+               }
+       
+               return size;
+       }
+
+       //generic decompression for OLE/RLE, to be overwritten for performance
+       @Override
+       public void decompressToBlock(MatrixBlock target) 
+       {
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               int[] colIndices = getColIndices();
+               
+               // Run through the bitmaps for this column group
+               for (int i = 0; i < numVals; i++) {
+                       Iterator<Integer> decoder = getDecodeIterator(i);
+                       int valOff = i*numCols;
+
+                       while (decoder.hasNext()) {
+                               int row = decoder.next();
+                               for (int colIx = 0; colIx < numCols; colIx++) {
+                                       target.appendValue(row, 
colIndices[colIx], _values[valOff+colIx]);
+                               }
+                       }
+               }
+       }
+
+       //generic decompression for OLE/RLE, to be overwritten for performance
+       @Override
+       public void decompressToBlock(MatrixBlock target, int[] 
colIndexTargets) 
+       {
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               
+               // Run through the bitmaps for this column group
+               for (int i = 0; i < numVals; i++) {
+                       Iterator<Integer> decoder = getDecodeIterator(i);
+                       int valOff = i*numCols;
+
+                       while (decoder.hasNext()) {
+                               int row = decoder.next();
+                               for (int colIx = 0; colIx < numCols; colIx++) {
+                                       int origMatrixColIx = 
getColIndex(colIx);
+                                       int targetColIx = 
colIndexTargets[origMatrixColIx];
+                                       target.quickSetValue(row, targetColIx, 
_values[valOff+colIx]);
+                               }
+                       }
+               }
+       }
+       
+       //generic decompression for OLE/RLE, to be overwritten for performance
+       @Override
+       public void decompressToBlock(MatrixBlock target, int colpos) 
+       {
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               
+               // Run through the bitmaps for this column group
+               for (int i = 0; i < numVals; i++) {
+                       Iterator<Integer> decoder = getDecodeIterator(i);
+                       int valOff = i*numCols;
+
+                       while (decoder.hasNext()) {
+                               int row = decoder.next();
+                               target.quickSetValue(row, 0, 
_values[valOff+colpos]);
+                       }
+               }
+       }
+       
+       /**
+        * 
+        * @param bitmapIx
+        * @return
+        */
+       protected double sumValues(int bitmapIx)
+       {
+               final int numCols = getNumCols();
+               final int valOff = bitmapIx * numCols;
+               
+               double val = 0.0;
+               for( int i = 0; i < numCols; i++ ) {
+                       val += _values[valOff+i];
+               }
+               
+               return val;
+       }
+       
+       protected double sumValues(int bitmapIx, double[] b)
+       {
+               final int numCols = getNumCols();
+               final int valOff = bitmapIx * numCols;
+               
+               double val = 0;
+               for( int i = 0; i < numCols; i++ ) {
+                       val += _values[valOff+i] * b[i];
+               }
+               
+               return val;
+       }
+       
+       /**
+        * 
+        * @param b
+        * @param c
+        */
+       protected void sumAllValues(double[] b, double[] c)
+       {
+               final int numVals = getNumValues();
+               final int numCols = getNumCols();
+               
+               //vectMultiplyAdd over cols instead of dotProduct over vals 
because
+               //usually more values than columns
+               for( int i=0, off=0; i<numCols; i++, off+=numVals )
+                       LinearAlgebraUtils.vectMultiplyAdd(b[i], _values, c, 
off, 0, numVals);
+       }
+
+       /**
+        * Method for use by subclasses. Applies a scalar operation to the value
+        * metadata stored in the superclass.
+        * 
+        * @param op
+        *            scalar operation to perform
+        * @return transformed copy of value metadata for this column group
+        * @throws DMLRuntimeException
+        */
+       protected double[] applyScalarOp(ScalarOperator op)
+                       throws DMLRuntimeException 
+       {
+               //scan over linearized values
+               double[] ret = new double[_values.length];
+               for (int i = 0; i < _values.length; i++) {
+                       ret[i] = op.executeScalar(_values[i]);
+               }
+
+               return ret;
+       }
+       
+       /**
+        * 
+        * @param op
+        * @param newVal
+        * @param numCols
+        * @return
+        * @throws DMLRuntimeException
+        */
+       protected double[] applyScalarOp(ScalarOperator op, double newVal, int 
numCols)
+                       throws DMLRuntimeException 
+       {
+               //scan over linearized values
+               double[] ret = new double[_values.length + numCols];
+               for( int i = 0; i < _values.length; i++ ) {
+                       ret[i] = op.executeScalar(_values[i]);
+               }
+               
+               //add new value to the end
+               Arrays.fill(ret, _values.length, _values.length+numCols, 
newVal);
+               
+               return ret;
+       }
+
+       /**
+        * @return the number of distinct sets of values associated with the 
bitmaps
+        *         in this column group
+        */
+       public int getNumValues() {
+               return _values.length / _colIndexes.length;
+       }
+
+       /**
+        * 
+        * @return
+        */
+       public double[] getValues() {
+               return _values;
+       }
+       
+       /**
+        * 
+        * @return
+        */
+       public char[] getBitmaps() {
+               return _data;
+       }
+       
+       public int[] getBitmapOffsets() {
+               return _ptr;
+       }
+
+       /**
+        * @param bmpIx
+        *            index of a specific compressed bitmap (stored in subclass,
+        *            index same as {@link #values})
+        * @return an object for iterating over the row offsets in this bitmap. 
Only
+        *         valid until the next call to this method. May be reused 
across
+        *         calls.
+        */
+       public abstract Iterator<Integer> getDecodeIterator(int bmpIx);
+
+       /**
+        * Utility function of sparse-unsafe operations.
+        * 
+        * @param ind
+        * @return
+        * @throws DMLRuntimeException
+        */
+       protected int[] computeOffsets(boolean[] ind)
+               throws DMLRuntimeException 
+       {
+               //determine number of offsets
+               int numOffsets = 0;
+               for( int i=0; i<ind.length; i++ )
+                       numOffsets += ind[i] ? 1 : 0;
+               
+               //create offset lists
+               int[] ret = new int[numOffsets];
+               for( int i=0, pos=0; i<ind.length; i++ )
+                       if( ind[i] )
+                               ret[pos++] = i;
+               
+               return ret;
+       }
+
+       @Override
+       public void readFields(DataInput in) 
+               throws IOException 
+       {
+               _numRows = in.readInt();
+               int numCols = in.readInt();
+               int numVals = in.readInt();
+               
+               //read col indices
+               _colIndexes = new int[ numCols ];
+               for( int i=0; i<numCols; i++ )
+                       _colIndexes[i] = in.readInt();
+               
+               //read distinct values
+               _values = new double[numVals*numCols];
+               for( int i=0; i<numVals*numCols; i++ )
+                       _values[i] = in.readDouble();
+               
+               //read bitmaps
+               int totalLen = in.readInt();
+               _ptr = new int[numVals+1];
+               _data = new char[totalLen];             
+               for( int i=0, off=0; i<numVals; i++ ) {
+                       int len = in.readInt();
+                       _ptr[i] = off;
+                       for( int j=0; j<len; j++ )
+                               _data[off+j] = in.readChar();
+                       off += len;
+               }
+               _ptr[numVals] = totalLen;
+       }
+       
+       @Override
+       public void write(DataOutput out) 
+               throws IOException 
+       {
+               int numCols = getNumCols();
+               int numVals = getNumValues();
+               out.writeInt(_numRows);
+               out.writeInt(numCols);
+               out.writeInt(numVals);
+               
+               //write col indices
+               for( int i=0; i<_colIndexes.length; i++ )
+                       out.writeInt( _colIndexes[i] );
+               
+               //write distinct values
+               for( int i=0; i<_values.length; i++ )
+                       out.writeDouble(_values[i]);
+
+               //write bitmaps (lens and data, offset later recreated)
+               int totalLen = 0;
+               for( int i=0; i<numVals; i++ )
+                       totalLen += len(i);
+               out.writeInt(totalLen); 
+               for( int i=0; i<numVals; i++ ) {
+                       int len = len(i);
+                       int off = _ptr[i];
+                       out.writeInt(len);
+                       for( int j=0; j<len; j++ )
+                               out.writeChar(_data[off+j]);
+               }
+       }
+
+       @Override
+       public long getExactSizeOnDisk() {
+               long ret = 12; //header
+               //col indices
+               ret += 4 * _colIndexes.length; 
+               //distinct values (groups of values)
+               ret += 8 * _values.length;
+               //actual bitmaps
+               ret += 4; //total length
+               for( int i=0; i<getNumValues(); i++ )
+                       ret += 4 + 2 * len(i);
+               
+               return ret;
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
new file mode 100644
index 0000000..a2b493a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -0,0 +1,634 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+import java.util.Iterator;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.compress.utils.ConverterUtils;
+import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
+import org.apache.sysml.runtime.functionobjects.KahanFunction;
+import org.apache.sysml.runtime.functionobjects.KahanPlus;
+import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
+import org.apache.sysml.runtime.functionobjects.ReduceAll;
+import org.apache.sysml.runtime.functionobjects.ReduceCol;
+import org.apache.sysml.runtime.functionobjects.ReduceRow;
+import org.apache.sysml.runtime.instructions.cp.KahanObject;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
+import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * Class to encapsulate information about a column group that is encoded with
+ * simple lists of offsets for each set of distinct values.
+ * 
+ */
+public class ColGroupOLE extends ColGroupBitmap 
+{
+       private static final long serialVersionUID = -9157676271360528008L;
+
+       public ColGroupOLE() {
+               super(CompressionType.OLE_BITMAP);
+       }
+       
+       /**
+        * Main constructor. Constructs and stores the necessary bitmaps.
+        * 
+        * @param colIndices
+        *            indices (within the block) of the columns included in this
+        *            column
+        * @param numRows
+        *            total number of rows in the parent block
+        * @param ubm
+        *            Uncompressed bitmap representation of the block
+        */
+       public ColGroupOLE(int[] colIndices, int numRows, UncompressedBitmap 
ubm) 
+       {
+               super(CompressionType.OLE_BITMAP, colIndices, numRows, ubm);
+
+               // compress the bitmaps
+               final int numVals = ubm.getNumValues();
+               char[][] lbitmaps = new char[numVals][];
+               int totalLen = 0;
+               for( int i=0; i<numVals; i++ ) {
+                       lbitmaps[i] = 
BitmapEncoder.genOffsetBitmap(ubm.getOffsetsList(i));
+                       totalLen += lbitmaps[i].length;
+               }
+               
+               // compact bitmaps to linearized representation
+               createCompressedBitmaps(numVals, totalLen, lbitmaps);
+               
+
+               if( LOW_LEVEL_OPT && CREATE_SKIPLIST
+                               && numRows > 2*BitmapEncoder.BITMAP_BLOCK_SZ )
+               {
+                       int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+                       _skiplist = new int[numVals];
+                       int rl = (getNumRows()/2/blksz)*blksz;
+                       for (int k = 0; k < numVals; k++) {
+                               int bitmapOff = _ptr[k];
+                               int bitmapLen = len(k);
+                               int bufIx = 0;
+                               for( int i=0; i<rl && bufIx<bitmapLen; i+=blksz 
) {
+                                       bufIx += _data[bitmapOff+bufIx] + 1;
+                               }
+                               _skiplist[k] = bufIx;
+                       }               
+               }
+       }
+
+       /**
+        * Constructor for internal use.
+        */
+       public ColGroupOLE(int[] colIndices, int numRows, double[] values, 
char[] bitmaps, int[] bitmapOffs) {
+               super(CompressionType.OLE_BITMAP, colIndices, numRows, values);
+               _data = bitmaps;
+               _ptr = bitmapOffs;
+       }
+
+       @Override
+       public Iterator<Integer> getDecodeIterator(int bmpIx) {
+               return new BitmapDecoderOLE(_data, _ptr[bmpIx], len(bmpIx));
+       }
+       
+       @Override
+       public void decompressToBlock(MatrixBlock target) 
+       {
+               if( LOW_LEVEL_OPT && getNumValues() > 1 )
+               {
+                       final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+                       final int numCols = getNumCols();
+                       final int numVals = getNumValues();
+                       final int n = getNumRows();
+                       
+                       //cache blocking config and position array
+                       int[] apos = new int[numVals];
+                                       
+                       //cache conscious append via horizontal scans 
+                       for( int bi=0; bi<n; bi+=blksz ) {
+                               for (int k = 0, off=0; k < numVals; k++, 
off+=numCols) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);                 
                
+                                       int bufIx = apos[k];
+                                       if( bufIx >= bitmapLen ) 
+                                               continue;
+                                       int len = _data[bitmapOff+bufIx];
+                                       int pos = bitmapOff+bufIx+1;
+                                       for( int i=pos; i<pos+len; i++ )
+                                               for( int j=0, rix = 
bi+_data[i]; j<numCols; j++ )
+                                                       if( _values[off+j]!=0 )
+                                                               
target.appendValue(rix, _colIndexes[j], _values[off+j]);
+                                       apos[k] += len + 1;
+                               }
+                       }               
+               }
+               else
+               {
+                       //call generic decompression with decoder
+                       super.decompressToBlock(target);
+               }
+       }
+
+       @Override
+       public void decompressToBlock(MatrixBlock target, int[] colixTargets) 
+       {
+               if( LOW_LEVEL_OPT && getNumValues() > 1 )
+               {
+                       final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+                       final int numCols = getNumCols();
+                       final int numVals = getNumValues();
+                       final int n = getNumRows();
+                       
+                       //cache blocking config and position array
+                       int[] apos = new int[numVals];                          
        
+                       int[] cix = new int[numCols];
+                       
+                       //prepare target col indexes
+                       for( int j=0; j<numCols; j++ )
+                               cix[j] = colixTargets[_colIndexes[j]];
+                       
+                       //cache conscious append via horizontal scans 
+                       for( int bi=0; bi<n; bi+=blksz ) {
+                               for (int k = 0, off=0; k < numVals; k++, 
off+=numCols) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);                 
                
+                                       int bufIx = apos[k];
+                                       if( bufIx >= bitmapLen ) 
+                                               continue;
+                                       int len = _data[bitmapOff+bufIx];
+                                       int pos = bitmapOff+bufIx+1;
+                                       for( int i=pos; i<pos+len; i++ )
+                                               for( int j=0, rix = 
bi+_data[i]; j<numCols; j++ )
+                                                       if( _values[off+j]!=0 )
+                                                               
target.appendValue(rix, cix[j], _values[off+j]);
+                                       apos[k] += len + 1;
+                               }
+                       }               
+               }
+               else
+               {
+                       //call generic decompression with decoder
+                       super.decompressToBlock(target, colixTargets);
+               }
+       }
+       
+       @Override
+       public void decompressToBlock(MatrixBlock target, int colpos) 
+       {
+               if( LOW_LEVEL_OPT && getNumValues() > 1 )
+               {
+                       final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+                       final int numCols = getNumCols();
+                       final int numVals = getNumValues();
+                       final int n = getNumRows();
+                       double[] c = target.getDenseBlock();
+                       
+                       //cache blocking config and position array
+                       int[] apos = new int[numVals];                          
        
+                       
+                       //cache conscious append via horizontal scans 
+                       for( int bi=0; bi<n; bi+=blksz ) {
+                               for (int k = 0, off=0; k < numVals; k++, 
off+=numCols) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);                 
                
+                                       int bufIx = apos[k];
+                                       if( bufIx >= bitmapLen ) 
+                                               continue;
+                                       int len = _data[bitmapOff+bufIx];
+                                       int pos = bitmapOff+bufIx+1;
+                                       for( int i=pos; i<pos+len; i++ ) {
+                                               c[bi+_data[i]] = 
_values[off+colpos];
+                                       }
+                                       apos[k] += len + 1;
+                               }
+                       }       
+                       
+                       target.recomputeNonZeros();
+               }
+               else
+               {
+                       //call generic decompression with decoder
+                       super.decompressToBlock(target, colpos);
+               }
+       }
+       
+       @Override
+       public ColGroup scalarOperation(ScalarOperator op)
+               throws DMLRuntimeException 
+       {
+               double val0 = op.executeScalar(0);
+               
+               //fast path: sparse-safe operations
+               // Note that bitmaps don't change and are shallow-copied
+               if( op.sparseSafe || val0==0 ) {
+                       return new ColGroupOLE(_colIndexes, _numRows, 
+                                       applyScalarOp(op), _data, _ptr);
+               }
+               
+               //slow path: sparse-unsafe operations (potentially create new 
bitmap)
+               //note: for efficiency, we currently don't drop values that 
become 0
+               boolean[] lind = computeZeroIndicatorVector();
+               int[] loff = computeOffsets(lind);
+               if( loff.length==0 ) { //empty offset list: go back to fast path
+                       return new ColGroupOLE(_colIndexes, _numRows, 
+                                       applyScalarOp(op), _data, _ptr);
+               }
+               
+               double[] rvalues = applyScalarOp(op, val0, getNumCols());       
        
+               char[] lbitmap = BitmapEncoder.genOffsetBitmap(loff);
+               char[] rbitmaps = Arrays.copyOf(_data, 
_data.length+lbitmap.length);
+               System.arraycopy(lbitmap, 0, rbitmaps, _data.length, 
lbitmap.length);
+               int[] rbitmapOffs = Arrays.copyOf(_ptr, _ptr.length+1);
+               rbitmapOffs[rbitmapOffs.length-1] = rbitmaps.length; 
+               
+               return new ColGroupOLE(_colIndexes, _numRows, 
+                               rvalues, rbitmaps, rbitmapOffs);
+       }
+
+       @Override
+       public void rightMultByVector(MatrixBlock vector, MatrixBlock result, 
int rl, int ru)
+                       throws DMLRuntimeException 
+       {
+               double[] b = ConverterUtils.getDenseVector(vector);
+               double[] c = result.getDenseBlock();
+               final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               
+               //prepare reduced rhs w/ relevant values
+               double[] sb = new double[numCols];
+               for (int j = 0; j < numCols; j++) {
+                       sb[j] = b[_colIndexes[j]];
+               }
+               
+               if( LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz )
+               {
+                       //since single segment scans already exceed typical L2 
cache sizes
+                       //and because there is some overhead associated with 
blocking, the
+                       //best configuration aligns with L3 cache size 
(x*vcores*64K*8B < L3)
+                       //x=4 leads to a good yet slightly conservative 
compromise for single-/
+                       //multi-threaded and typical number of cores and L3 
cache sizes
+                       final int blksz2 = ColGroupBitmap.WRITE_CACHE_BLKSZ;
+                       
+                       //step 1: prepare position and value arrays
+                       
+                       //current pos / values per OLs
+                       int[] apos = new int[numVals];
+                       double[] aval = new double[numVals];
+                       
+                       //skip-scan to beginning for all OLs 
+                       if( rl > 0 ) { //rl aligned with blksz          
+                               int rskip = (getNumRows()/2/blksz)*blksz;
+                               
+                               for (int k = 0; k < numVals; k++) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);
+                                       int start = (rl>=rskip)?rskip:0;
+                                       int bufIx = (rl>=rskip)?_skiplist[k]:0;
+                                       for( int i=start; i<rl && 
bufIx<bitmapLen; i+=blksz ) {
+                                               bufIx += _data[bitmapOff+bufIx] 
+ 1;
+                                       }
+                                       apos[k] = bufIx;
+                               }
+                       }
+                       
+                       //pre-aggregate values per OLs
+                       for( int k = 0; k < numVals; k++ )
+                               aval[k] = sumValues(k, sb);
+                                       
+                       //step 2: cache conscious matrix-vector via horizontal 
scans 
+                       for( int bi=rl; bi<ru; bi+=blksz2 ) 
+                       {
+                               int bimax = Math.min(bi+blksz2, ru);
+                               
+                               //horizontal segment scan, incl pos maintenance
+                               for (int k = 0; k < numVals; k++) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);
+                                       double val = aval[k];
+                                       int bufIx = apos[k];
+                                       
+                                       for( int ii=bi; ii<bimax && 
bufIx<bitmapLen; ii+=blksz ) {
+                                               //prepare length, start, and 
end pos
+                                               int len = 
_data[bitmapOff+bufIx];
+                                               int pos = bitmapOff+bufIx+1;
+                                               
+                                               //compute partial results
+                                               
//LinearAlgebraUtils.vectAdd(val, c, bitmaps, pos, ii, len);
+                                               for( int i=pos; i<pos+len; i++ )
+                                                       c[ii + _data[i]] += 
val;        
+                                               bufIx += len + 1;
+                                       }
+
+                                       apos[k] = bufIx;
+                               }
+                       }               
+               }
+               else
+               {       
+                       //iterate over all values and their bitmaps
+                       for (int k = 0; k < numVals; k++) 
+                       {
+                               //prepare value-to-add for entire value bitmap
+                               int bitmapOff = _ptr[k];
+                               int bitmapLen = len(k);
+                               double val = sumValues(k, sb);
+                               
+                               //iterate over bitmap blocks and add values
+                               if (val != 0) {
+                                       int offsetBase = 0;
+                                       int bufIx = 0;
+                                       int curBlckLen;
+                                       
+                                       //scan to beginning offset if necessary 
+                                       if( rl > 0 ){
+                                               for (; bufIx<bitmapLen & 
offsetBase<rl; bufIx += curBlckLen + 1, offsetBase += blksz) {
+                                                       curBlckLen = 
_data[bitmapOff+bufIx];
+                                               }       
+                                       }
+                                       
+                                       //compute partial results
+                                       for (; bufIx<bitmapLen & offsetBase<ru; 
bufIx += curBlckLen + 1, offsetBase += blksz) {
+                                               curBlckLen = 
_data[bitmapOff+bufIx];
+                                               for (int blckIx = 1; blckIx <= 
curBlckLen; blckIx++) {
+                                                       c[offsetBase + 
_data[bitmapOff+bufIx + blckIx]] += val;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       @Override
+       public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result)
+               throws DMLRuntimeException 
+       {
+               double[] a = ConverterUtils.getDenseVector(vector);
+               double[] c = result.getDenseBlock();
+               final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               final int n = getNumRows();
+               
+               if( LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz )
+               {
+                       //cache blocking config (see matrix-vector mult for 
explanation)
+                       final int blksz2 = ColGroupBitmap.READ_CACHE_BLKSZ;
+                       
+                       //step 1: prepare position and value arrays
+                       
+                       //current pos per OLs / output values
+                       int[] apos = new int[numVals];
+                       double[] cvals = new double[numVals];
+                       
+                       //step 2: cache conscious matrix-vector via horizontal 
scans 
+                       for( int ai=0; ai<n; ai+=blksz2 ) 
+                       {
+                               int aimax = Math.min(ai+blksz2, n);
+                               
+                               //horizontal segment scan, incl pos maintenance
+                               for (int k = 0; k < numVals; k++) {
+                                       int bitmapOff = _ptr[k];
+                                       int bitmapLen = len(k);
+                                       int bufIx = apos[k];
+                                       double vsum = 0;        
+                                       
+                                       for( int ii=ai; ii<aimax && 
bufIx<bitmapLen; ii+=blksz ) {
+                                               //prepare length, start, and 
end pos
+                                               int len = 
_data[bitmapOff+bufIx];
+                                               int pos = bitmapOff+bufIx+1;
+                                               
+                                               //iterate over bitmap blocks 
and compute partial results (a[i]*1)
+                                               vsum += 
LinearAlgebraUtils.vectSum(a, _data, ii, pos, len);
+                                               bufIx += len + 1;
+                                       }
+
+                                       apos[k] = bufIx;
+                                       cvals[k] += vsum;
+                               }
+                       }
+                       
+                       //step 3: scale partial results by values and write to 
global output
+                       for (int k = 0, valOff=0; k < numVals; k++, 
valOff+=numCols)
+                               for( int j = 0; j < numCols; j++ )
+                                       c[ _colIndexes[j] ] += cvals[k] * 
_values[valOff+j];            
+               }
+               else
+               {
+                       //iterate over all values and their bitmaps
+                       for (int k=0, valOff=0; k<numVals; k++, 
valOff+=numCols) 
+                       {
+                               int bitmapOff = _ptr[k];
+                               int bitmapLen = len(k);
+                               
+                               //iterate over bitmap blocks and add partial 
results
+                               double vsum = 0;
+                               for (int bix=0, off=0; bix < bitmapLen; 
bix+=_data[bitmapOff+bix]+1, off+=blksz)
+                                       vsum += LinearAlgebraUtils.vectSum(a, 
_data, off, bitmapOff+bix+1, _data[bitmapOff+bix]);
+                               
+                               //scale partial results by values and write 
results
+                               for( int j = 0; j < numCols; j++ )
+                                       c[ _colIndexes[j] ] += vsum * _values[ 
valOff+j ];
+                       }
+               }
+       }
+
+       @Override
+       public void unaryAggregateOperations(AggregateUnaryOperator op, 
MatrixBlock result) 
+               throws DMLRuntimeException 
+       {
+               KahanFunction kplus = (op.aggOp.increOp.fn instanceof 
KahanPlus) ?
+                               KahanPlus.getKahanPlusFnObject() : 
KahanPlusSq.getKahanPlusSqFnObject();
+               
+               if( op.indexFn instanceof ReduceAll )
+                       computeSum(result, kplus);
+               else if( op.indexFn instanceof ReduceCol )
+                       computeRowSums(result, kplus);
+               else if( op.indexFn instanceof ReduceRow )
+                       computeColSums(result, kplus);
+       }
+       
+       /**
+        * 
+        * @param result
+        */
+       private void computeSum(MatrixBlock result, KahanFunction kplus)
+       {
+               KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), 
result.quickGetValue(0, 1));
+               
+               //iterate over all values and their bitmaps
+               final int numVals = getNumValues();
+               final int numCols = getNumCols();
+               
+               for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+               {
+                       int bitmapOff = _ptr[bitmapIx];
+                       int bitmapLen = len(bitmapIx);
+                       int valOff = bitmapIx * numCols;
+                       
+                       //iterate over bitmap blocks and count partial lengths
+                       int count = 0;
+                       for (int bix=0; bix < bitmapLen; 
bix+=_data[bitmapOff+bix]+1)
+                               count += _data[bitmapOff+bix];
+                       
+                       //scale counts by all values
+                       for( int j = 0; j < numCols; j++ )
+                               kplus.execute3(kbuff, _values[ valOff+j ], 
count);
+               }
+               
+               result.quickSetValue(0, 0, kbuff._sum);
+               result.quickSetValue(0, 1, kbuff._correction);
+       }
+       
+       /**
+        * 
+        * @param result
+        */
+       private void computeRowSums(MatrixBlock result, KahanFunction kplus)
+       {
+               KahanObject kbuff = new KahanObject(0, 0);
+       
+               final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+               final int numVals = getNumValues();
+               
+               //iterate over all values and their bitmaps
+               for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+               {
+                       //prepare value-to-add for entire value bitmap
+                       int bitmapOff = _ptr[bitmapIx];
+                       int bitmapLen = len(bitmapIx);
+                       double val = sumValues(bitmapIx);
+                       
+                       //iterate over bitmap blocks and add values
+                       if (val != 0) {
+                               int offsetBase = 0;
+                               int bufIx = 0;
+                               int curBlckLen;
+                               for (; bufIx < bitmapLen; bufIx += curBlckLen + 
1, offsetBase += blksz) {
+                                       curBlckLen = _data[bitmapOff+bufIx];
+                                       for (int blckIx = 1; blckIx <= 
curBlckLen; blckIx++) {
+                                               int rix = offsetBase + 
_data[bitmapOff+bufIx + blckIx];
+                                               
kbuff.set(result.quickGetValue(rix, 0), result.quickGetValue(rix, 1));
+                                               kplus.execute2(kbuff, val);
+                                               result.quickSetValue(rix, 0, 
kbuff._sum);
+                                               result.quickSetValue(rix, 1, 
kbuff._correction);
+                                       }
+                               }
+                       }
+               }
+       }
+       
+       /**
+        * 
+        * @param result
+        */
+       private void computeColSums(MatrixBlock result, KahanFunction kplus)
+       {
+               KahanObject kbuff = new KahanObject(0, 0);
+               
+               //iterate over all values and their bitmaps
+               final int numVals = getNumValues();
+               final int numCols = getNumCols();
+               for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+               {
+                       int bitmapOff = _ptr[bitmapIx];
+                       int bitmapLen = len(bitmapIx);
+                       int valOff = bitmapIx * numCols;
+                       
+                       //iterate over bitmap blocks and count partial lengths
+                       int count = 0;
+                       for (int bix=0; bix < bitmapLen; 
bix+=_data[bitmapOff+bix]+1)
+                               count += _data[bitmapOff+bix];
+                       
+                       //scale counts by all values
+                       for( int j = 0; j < numCols; j++ ) {
+                               kbuff.set(result.quickGetValue(0, 
_colIndexes[j]),result.quickGetValue(1, _colIndexes[j]));
+                               kplus.execute3(kbuff, _values[ valOff+j ], 
count);
+                               result.quickSetValue(0, _colIndexes[j], 
kbuff._sum);
+                               result.quickSetValue(1, _colIndexes[j], 
kbuff._correction);
+                       }
+               }
+       }
+       
+       /**
+        * Utility function of sparse-unsafe operations.
+        * 
+        * @return
+        * @throws DMLRuntimeException
+        */
+       private boolean[] computeZeroIndicatorVector()
+               throws DMLRuntimeException 
+       {
+               boolean[] ret = new boolean[_numRows];
+               final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+               final int numVals = getNumValues();
+               
+               //initialize everything with zero
+               Arrays.fill(ret, true);
+               
+               //iterate over all values and their bitmaps
+               for (int bitmapIx = 0; bitmapIx < numVals; bitmapIx++) 
+               {
+                       //prepare value-to-add for entire value bitmap
+                       int bitmapOff = _ptr[bitmapIx];
+                       int bitmapLen = len(bitmapIx);
+                       
+                       //iterate over bitmap blocks and add values
+                       int offsetBase = 0;
+                       int bufIx = 0;
+                       int curBlckLen;
+                       for (; bufIx < bitmapLen; bufIx += curBlckLen + 1, 
offsetBase += blksz) {
+                               curBlckLen = _data[bitmapOff+bufIx];
+                               for (int blckIx = 1; blckIx <= curBlckLen; 
blckIx++) {
+                                       ret[offsetBase + _data[bitmapOff+bufIx 
+ blckIx]] &= false;
+                               }
+                       }
+               }
+               
+               return ret;
+       }
+       
+       @Override
+       protected void countNonZerosPerRow(int[] rnnz)
+       {
+               final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
+               final int numVals = getNumValues();
+               final int numCols = getNumCols();
+               
+               //iterate over all values and their bitmaps
+               for (int k = 0; k < numVals; k++) 
+               {
+                       //prepare value-to-add for entire value bitmap
+                       int bitmapOff = _ptr[k];
+                       int bitmapLen = len(k);
+                       
+                       //iterate over bitmap blocks and add values
+                       int offsetBase = 0;
+                       int curBlckLen;
+                       for (int bufIx=0; bufIx<bitmapLen; bufIx+=curBlckLen+1, 
offsetBase+=blksz) {
+                               curBlckLen = _data[bitmapOff+bufIx];
+                               for (int blckIx = 1; blckIx <= curBlckLen; 
blckIx++) {
+                                       rnnz[offsetBase + _data[bitmapOff+bufIx 
+ blckIx]] += numCols;
+                               }
+                       }
+               }
+       }
+}

Reply via email to