Repository: incubator-systemml
Updated Branches:
  refs/heads/master 30dcd64b4 -> 88030df8f


[SYSTEMML-1558] Performance compressed tsmm (nnz, w/o ddc decompress)

This commit makes various minor performance improvements to tsmm
operations over compressed matrix blocks, including fused nnz
maintenance for decompressed vectors and outputs, as well as
vector-matrix operations directly over compressed DDC vectors in order
to avoid unnecessary decompression. 

We also experimented with cache-conscious schemes by blocking
decompressed and compressed columns groups but this did not show any
improvements, likely due to the coarse granularity of individual column
groups.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/88030df8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/88030df8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/88030df8

Branch: refs/heads/master
Commit: 88030df8f98098e02a7c4016efda00565b2ac993
Parents: 30dcd64
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Mon Apr 24 23:17:54 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Mon Apr 24 23:17:54 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/compress/ColGroupDDC.java     | 13 +++-
 .../sysml/runtime/compress/ColGroupDDC1.java    | 30 ++++++----
 .../sysml/runtime/compress/ColGroupDDC2.java    | 59 ++++++++++++------
 .../sysml/runtime/compress/ColGroupOLE.java     | 29 ++++++++-
 .../sysml/runtime/compress/ColGroupRLE.java     | 39 +++++++++++-
 .../sysml/runtime/compress/ColGroupValue.java   |  5 ++
 .../runtime/compress/CompressedMatrixBlock.java | 63 +++++++++++++++-----
 .../compress/utils/LinearAlgebraUtils.java      | 21 +++----
 8 files changed, 196 insertions(+), 63 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java
index 1782e2e..9a6067b 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java
@@ -200,7 +200,18 @@ public abstract class ColGroupDDC extends ColGroupValue
                                c[i] = builtin.execute2(c[i], getData(i, j));
        }
        
-       
+       protected final void postScaling(double[] vals, double[] c) {
+               final int ncol = getNumCols();
+               final int numVals = getNumValues();
+               
+               for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) {
+                       double aval = vals[k];
+                       for( int j=0; j<ncol; j++ ) {
+                               int colIx = _colIndexes[j];
+                               c[colIx] += aval * _values[valOff+j];
+                       }       
+               }
+       }
 
        /**
         * Generic get value for byte-length-agnostic access.

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
index 4cd103e..8fb7d99 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java
@@ -173,9 +173,10 @@ public class ColGroupDDC1 extends ColGroupDDC
                int nrow = getNumRows();
                int ncol = getNumCols();
                double[] c = target.getDenseBlock();
+               int nnz = 0;
                for( int i = 0; i < nrow; i++ )
-                       c[i] = _values[(_data[i]&0xFF)*ncol+colpos];
-               target.recomputeNonZeros();
+                       nnz += ((c[i] = 
_values[(_data[i]&0xFF)*ncol+colpos])!=0) ? 1 : 0;
+               target.setNonZeros(nnz);
        }
 
        @Override 
@@ -263,7 +264,6 @@ public class ColGroupDDC1 extends ColGroupDDC
                double[] a = ConverterUtils.getDenseVector(vector);
                double[] c = result.getDenseBlock();
                final int nrow = getNumRows();
-               final int ncol = getNumCols();
                final int numVals = getNumValues();
                
                //iterative over codes and pre-aggregate inputs per code 
(guaranteed <=255)
@@ -274,13 +274,23 @@ public class ColGroupDDC1 extends ColGroupDDC
                }
                
                //post-scaling of pre-aggregate with distinct values
-               for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) {
-                       double aval = vals[k];
-                       for( int j=0; j<ncol; j++ ) {
-                               int colIx = _colIndexes[j];
-                               c[colIx] += aval * _values[valOff+j];
-                       }       
-               }
+               postScaling(vals, c);
+       }
+       
+       @Override
+       public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) 
throws DMLRuntimeException {
+               double[] c = result.getDenseBlock();
+               final int nrow = getNumRows();
+               final int numVals = getNumValues();
+               
+               //iterative over codes and pre-aggregate inputs per code 
(guaranteed <=255)
+               //temporary array also avoids false sharing in multi-threaded 
environments
+               double[] vals = allocDVector(numVals, true);
+               for( int i=0; i<nrow; i++ )
+                       vals[_data[i]&0xFF] += a.getData(i, 0);
+               
+               //post-scaling of pre-aggregate with distinct values
+               postScaling(vals, c);
        }
        
        @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
index f447aaf..170eb96 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java
@@ -175,9 +175,10 @@ public class ColGroupDDC2 extends ColGroupDDC
                int nrow = getNumRows();
                int ncol = getNumCols();
                double[] c = target.getDenseBlock();
+               int nnz = 0;
                for( int i = 0; i < nrow; i++ )
-                       c[i] = _values[_data[i]*ncol+colpos];
-               target.recomputeNonZeros();
+                       nnz += ((c[i] = _values[_data[i]*ncol+colpos])!=0) ? 1 
: 0;
+               target.setNonZeros(nnz);
        }
        
        @Override 
@@ -250,27 +251,49 @@ public class ColGroupDDC2 extends ColGroupDDC
                        }
                        
                        //post-scaling of pre-aggregate with distinct values
-                       for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) {
-                               double aval = vals[k];
-                               for( int j=0; j<ncol; j++ ) {
-                                       int colIx = _colIndexes[j];
-                                       c[colIx] += aval * _values[valOff+j];
-                               }       
-                       }
+                       postScaling(vals, c);
                }
                else //general case
-               {
-               
+               {       
                        //iterate over codes, compute all, and add to the result
                        for( int i=0; i<nrow; i++ ) {
                                double aval = a[i];
-                               if( aval != 0 ) {
-                                       int valOff = _data[i] * ncol;
-                                       for( int j=0; j<ncol; j++ ) {
-                                               int colIx = _colIndexes[j];
-                                               c[colIx] += aval * 
_values[valOff+j];
-                                       }
-                               }
+                               if( aval != 0 )
+                                       for( int j=0, valOff=_data[i]*ncol; 
j<ncol; j++ )
+                                               c[_colIndexes[j]] += aval * 
_values[valOff+j];
+                       }
+               }
+       }
+       
+       @Override
+       public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) 
+               throws DMLRuntimeException 
+       {
+               double[] c = result.getDenseBlock();
+               final int nrow = getNumRows();
+               final int ncol = getNumCols();
+               final int numVals = getNumValues();
+               
+               if( 8*numVals < getNumRows() )
+               {
+                       //iterative over codes and pre-aggregate inputs per code
+                       //temporary array also avoids false sharing in 
multi-threaded environments
+                       double[] vals = allocDVector(numVals, true);
+                       for( int i=0; i<nrow; i++ ) {
+                               vals[_data[i]] += a.getData(i, 0);
+                       }
+                       
+                       //post-scaling of pre-aggregate with distinct values
+                       postScaling(vals, c);
+               }
+               else //general case
+               {       
+                       //iterate over codes, compute all, and add to the result
+                       for( int i=0; i<nrow; i++ ) {
+                               double aval = a.getData(i, 0);
+                               if( aval != 0 )
+                                       for( int j=0, valOff=_data[i]*ncol; 
j<ncol; j++ )
+                                               c[_colIndexes[j]] += aval * 
_values[valOff+j];
                        }
                }
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
index fab8917..70f759e 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java
@@ -211,7 +211,9 @@ public class ColGroupOLE extends ColGroupOffset
                int[] apos = allocIVector(numVals, true);
                
                //cache conscious append via horizontal scans 
+               int nnz = 0;
                for( int bi=0; bi<n; bi+=blksz ) {
+                       Arrays.fill(c, bi, Math.min(bi+blksz, n), 0);
                        for (int k = 0, off=0; k < numVals; k++, off+=numCols) {
                                int boff = _ptr[k];
                                int blen = len(k);
@@ -222,11 +224,12 @@ public class ColGroupOLE extends ColGroupOffset
                                int pos = boff+bix+1;
                                for( int i=pos; i<pos+len; i++ ) {
                                        c[bi+_data[i]] = _values[off+colpos];
+                                       nnz++;
                                }
                                apos[k] += len + 1;
                        }
                }
-               target.recomputeNonZeros();
+               target.setNonZeros(nnz);
        }
        
        @Override 
@@ -446,6 +449,30 @@ public class ColGroupOLE extends ColGroupOffset
        }
 
        @Override
+       public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result)
+               throws DMLRuntimeException 
+       {
+               //note: this method is only applicable for numrows < blocksize
+               double[] c = result.getDenseBlock();
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+               
+               //iterate over all values and their bitmaps
+               for (int k=0, valOff=0; k<numVals; k++, valOff+=numCols) {
+                       int boff = _ptr[k];
+                       
+                       //iterate over bitmap blocks and add partial results
+                       double vsum = 0;
+                       for( int j = boff+1; j < boff+1+_data[boff]; j++ )
+                               vsum += a.getData(_data[j], 0);
+                       
+                       //scale partial results by values and write results
+                       for( int j = 0; j < numCols; j++ )
+                               c[ _colIndexes[j] ] += vsum * _values[ valOff+j 
];
+               }
+       }
+       
+       @Override
        protected final void computeSum(MatrixBlock result, KahanFunction kplus)
        {
                KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), 
result.quickGetValue(0, 1));

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
index cd658b0..6d1fb9f 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java
@@ -203,8 +203,10 @@ public class ColGroupRLE extends ColGroupOffset
                int[] apos = allocIVector(numVals, true);
                
                //cache conscious append via horizontal scans 
+               int nnz = 0;
                for( int bi=0; bi<n; bi+=blksz ) {
                        int bimax = Math.min(bi+blksz, n);
+                       Arrays.fill(c, bi, bimax, 0);
                        for (int k=0, off=0; k < numVals; k++, off+=numCols) {
                                int boff = _ptr[k];
                                int blen = len(k);
@@ -215,15 +217,15 @@ public class ColGroupRLE extends ColGroupOffset
                                for( ; bix<blen & start<bimax; bix+=2) {
                                        start += _data[boff + bix];
                                        int len = _data[boff + bix+1];
-                                       for( int i=start; i<start+len; i++ )
-                                               c[i] = _values[off+colpos];
+                                       Arrays.fill(c, start, start+len, 
_values[off+colpos]);
+                                       nnz += len;
                                        start += len;
                                }
                                apos[k] = bix;  
                                astart[k] = start;
                        }
                }
-               target.recomputeNonZeros();
+               target.setNonZeros(nnz);
        }
        
        @Override 
@@ -422,6 +424,37 @@ public class ColGroupRLE extends ColGroupOffset
        }
 
        @Override
+       public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result)
+                       throws DMLRuntimeException 
+       {
+               //note: this method is only applicable for numrows < blocksize
+               double[] c = result.getDenseBlock();
+               final int numCols = getNumCols();
+               final int numVals = getNumValues();
+
+               //iterate over all values and their bitmaps
+               for (int k=0, valOff=0; k<numVals; k++, valOff+=numCols) 
+               {       
+                       int boff = _ptr[k];
+                       int blen = len(k);
+                       
+                       double vsum = 0;
+                       int curRunEnd = 0;
+                       for ( int bix = 0; bix < blen; bix+=2 ) {
+                               int curRunStartOff = curRunEnd + 
_data[boff+bix];
+                               int curRunLen = _data[boff+bix+1];
+                               for( int i=curRunStartOff; 
i<curRunStartOff+curRunLen; i++ )
+                                       vsum += a.getData(i, 0);
+                               curRunEnd = curRunStartOff + curRunLen;
+                       }
+                       
+                       //scale partial results by values and write results
+                       for( int j = 0; j < numCols; j++ )
+                               c[ _colIndexes[j] ] += vsum * _values[ valOff+j 
];
+               }
+       }
+       
+       @Override
        public ColGroup scalarOperation(ScalarOperator op)
                        throws DMLRuntimeException 
        {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java 
b/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java
index 8bfc019..79d963e 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java
@@ -296,6 +296,11 @@ public abstract class ColGroupValue extends ColGroup
                for( int j=0; j<numCols; j++ )
                        result.quickSetValue(0, _colIndexes[j], vals[j]);
        }
+
+       //additional vector-matrix multiplication to avoid DDC uncompression
+       public abstract void leftMultByRowVector(ColGroupDDC vector, 
MatrixBlock result) 
+               throws DMLRuntimeException;
+
        
        /**
         * Method for use by subclasses. Applies a scalar operation to the value

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
index 003f31a..ed2ab27 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java
@@ -1222,8 +1222,8 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
                        leftMultByTransposeSelf(_colGroups, out, 0, 
_colGroups.size());
                        
                        // post-processing
-                       LinearAlgebraUtils.copyUpperToLowerTriangle(out);
-                       out.recomputeNonZeros();
+                       out.setNonZeros(LinearAlgebraUtils
+                               .copyUpperToLowerTriangle(out));
                }
                
                if( LOG.isDebugEnabled() )
@@ -1280,8 +1280,8 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
                        }
                        
                        // post-processing
-                       LinearAlgebraUtils.copyUpperToLowerTriangle(out);
-                       out.recomputeNonZeros();
+                       out.setNonZeros(LinearAlgebraUtils
+                               .copyUpperToLowerTriangle(out));
                }
                
                if( LOG.isDebugEnabled() )
@@ -1434,6 +1434,21 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
                result.recomputeNonZeros();
        }
        
+       private static void leftMultByVectorTranspose(List<ColGroup> colGroups, 
ColGroupDDC vector, MatrixBlock result) 
+               throws DMLRuntimeException 
+       {
+               // initialize and allocate the result
+               result.reset();
+               
+               // delegate matrix-vector operation to each column group
+               for( ColGroup grp : colGroups ) {                       
+                       ((ColGroupValue)grp).leftMultByRowVector(vector, 
result);
+               }
+               
+               // post-processing
+               result.recomputeNonZeros();
+       }
+       
        /**
         * Multi-thread version of leftMultByVectorTranspose.
         * 
@@ -1492,6 +1507,7 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
        {
                final int numRows = groups.get(0).getNumRows();
                final int numGroups = groups.size();            
+               final boolean containsUC = containsUncompressedColGroup(groups);
                
                //preallocated dense tmp matrix blocks
                MatrixBlock lhs = new MatrixBlock(1, numRows, false);
@@ -1511,20 +1527,28 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
                        int[] ixgroup = group.getColIndices();
                        List<ColGroup> tmpList = groups.subList(i, numGroups);
                        
-                       //for all uncompressed lhs columns vectors
-                       for( int j=0; j<ixgroup.length; j++ ) {
-                               //decompress single column
-                               if( !(group instanceof ColGroupDDC) )
-                                       lhs.reset(1, numRows, false);           
                
-                               group.decompressToBlock(lhs, j);
+                       if( group instanceof ColGroupDDC //single DDC group
+                               && ixgroup.length==1 && !containsUC && 
numRows<BitmapEncoder.BITMAP_BLOCK_SZ ) 
+                       {
+                               //compute vector-matrix partial result
+                               leftMultByVectorTranspose(tmpList, 
(ColGroupDDC)group, tmpret);                                                    
     
                                
-                               if( !lhs.isEmptyBlock(false) ) {
-                                       //compute vector-matrix partial result
-                                       leftMultByVectorTranspose(tmpList, lhs, 
tmpret, false, false);                                                          
+                               //write partial results (disjoint non-zeros)
+                               
LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[0]);     
+                       }
+                       else {
+                               //for all uncompressed lhs columns vectors
+                               for( int j=0; j<ixgroup.length; j++ ) {
+                                       group.decompressToBlock(lhs, j);
                                        
-                                       //write partial results (disjoint 
non-zeros)
-                                       
LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[j]);     
-                               }
+                                       if( !lhs.isEmptyBlock(false) ) {
+                                               //compute vector-matrix partial 
result
+                                               
leftMultByVectorTranspose(tmpList, lhs, tmpret, false, false);                  
                                        
+                                               
+                                               //write partial results 
(disjoint non-zeros)
+                                               
LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[j]);     
+                                       }
+                               }       
                        }
                }
                
@@ -1574,6 +1598,13 @@ public class CompressedMatrixBlock extends MatrixBlock 
implements Externalizable
                
                return null;
        }
+       
+       private static boolean containsUncompressedColGroup(ArrayList<ColGroup> 
groups) {
+               for( ColGroup grp : groups )
+                       if( grp instanceof ColGroupUncompressed ) 
+                               return true;
+               return false;
+       }
 
        private static class LeftMatrixMultTask implements Callable<Object> 
        {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java 
b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
index 515457b..de6e233 100644
--- 
a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
+++ 
b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java
@@ -128,27 +128,20 @@ public class LinearAlgebraUtils
                final int bn = len%8;
                
                //rest, not aligned to 8-blocks
-               for( int j = 0; j < bn; j++, ai++ )
-                       val += a[ ai ];
+               for( int j = ai; j < ai+bn; j++ )
+                       val += a[ j ];
                
                //unrolled 8-block (for better instruction-level parallelism)
-               for( int j = bn; j < len; j+=8, ai+=8 )
-               {
-                       val += a[ ai+0 ]
-                            + a[ ai+1 ]
-                            + a[ ai+2 ]
-                            + a[ ai+3 ]
-                            + a[ ai+4 ]
-                            + a[ ai+5 ]
-                            + a[ ai+6 ]
-                            + a[ ai+7 ];
+               for( int j = ai+bn; j < ai+len; j+=8 ) {
+                       val += a[ j+0 ] + a[ j+1 ] + a[ j+2 ] + a[ j+3 ]
+                            + a[ j+4 ] + a[ j+5 ] + a[ j+6 ] + a[ j+7 ];
                }
                
                return val;
        }
 
-       public static void copyUpperToLowerTriangle( MatrixBlock ret ) {
-               LibMatrixMult.copyUpperToLowerTriangle(ret);
+       public static long copyUpperToLowerTriangle( MatrixBlock ret ) {
+               return LibMatrixMult.copyUpperToLowerTriangle(ret);
        }
        
        public static void copyNonZerosToUpperTriangle( MatrixBlock ret, 
MatrixBlock tmp, int ix ) {

Reply via email to