Repository: incubator-systemml Updated Branches: refs/heads/master 30dcd64b4 -> 88030df8f
[SYSTEMML-1558] Performance compressed tsmm (nnz, w/o ddc decompress) This commit makes various minor performance improvements to tsmm operations over compressed matrix blocks, including fused nnz maintenance for decompressed vectors and outputs, as well as vector-matrix operations directly over compressed DDC vectors in order to avoid unnecessary decompression. We also experimented with cache-conscious schemes by blocking decompressed and compressed columns groups but this did not show any improvements, likely due to the coarse granularity of individual column groups. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/88030df8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/88030df8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/88030df8 Branch: refs/heads/master Commit: 88030df8f98098e02a7c4016efda00565b2ac993 Parents: 30dcd64 Author: Matthias Boehm <mboe...@gmail.com> Authored: Mon Apr 24 23:17:54 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Mon Apr 24 23:17:54 2017 -0700 ---------------------------------------------------------------------- .../sysml/runtime/compress/ColGroupDDC.java | 13 +++- .../sysml/runtime/compress/ColGroupDDC1.java | 30 ++++++---- .../sysml/runtime/compress/ColGroupDDC2.java | 59 ++++++++++++------ .../sysml/runtime/compress/ColGroupOLE.java | 29 ++++++++- .../sysml/runtime/compress/ColGroupRLE.java | 39 +++++++++++- .../sysml/runtime/compress/ColGroupValue.java | 5 ++ .../runtime/compress/CompressedMatrixBlock.java | 63 +++++++++++++++----- .../compress/utils/LinearAlgebraUtils.java | 21 +++---- 8 files changed, 196 insertions(+), 63 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java index 1782e2e..9a6067b 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC.java @@ -200,7 +200,18 @@ public abstract class ColGroupDDC extends ColGroupValue c[i] = builtin.execute2(c[i], getData(i, j)); } - + protected final void postScaling(double[] vals, double[] c) { + final int ncol = getNumCols(); + final int numVals = getNumValues(); + + for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) { + double aval = vals[k]; + for( int j=0; j<ncol; j++ ) { + int colIx = _colIndexes[j]; + c[colIx] += aval * _values[valOff+j]; + } + } + } /** * Generic get value for byte-length-agnostic access. http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java index 4cd103e..8fb7d99 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC1.java @@ -173,9 +173,10 @@ public class ColGroupDDC1 extends ColGroupDDC int nrow = getNumRows(); int ncol = getNumCols(); double[] c = target.getDenseBlock(); + int nnz = 0; for( int i = 0; i < nrow; i++ ) - c[i] = _values[(_data[i]&0xFF)*ncol+colpos]; - target.recomputeNonZeros(); + nnz += ((c[i] = _values[(_data[i]&0xFF)*ncol+colpos])!=0) ? 1 : 0; + target.setNonZeros(nnz); } @Override @@ -263,7 +264,6 @@ public class ColGroupDDC1 extends ColGroupDDC double[] a = ConverterUtils.getDenseVector(vector); double[] c = result.getDenseBlock(); final int nrow = getNumRows(); - final int ncol = getNumCols(); final int numVals = getNumValues(); //iterative over codes and pre-aggregate inputs per code (guaranteed <=255) @@ -274,13 +274,23 @@ public class ColGroupDDC1 extends ColGroupDDC } //post-scaling of pre-aggregate with distinct values - for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) { - double aval = vals[k]; - for( int j=0; j<ncol; j++ ) { - int colIx = _colIndexes[j]; - c[colIx] += aval * _values[valOff+j]; - } - } + postScaling(vals, c); + } + + @Override + public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) throws DMLRuntimeException { + double[] c = result.getDenseBlock(); + final int nrow = getNumRows(); + final int numVals = getNumValues(); + + //iterative over codes and pre-aggregate inputs per code (guaranteed <=255) + //temporary array also avoids false sharing in multi-threaded environments + double[] vals = allocDVector(numVals, true); + for( int i=0; i<nrow; i++ ) + vals[_data[i]&0xFF] += a.getData(i, 0); + + //post-scaling of pre-aggregate with distinct values + postScaling(vals, c); } @Override http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java index f447aaf..170eb96 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupDDC2.java @@ -175,9 +175,10 @@ public class ColGroupDDC2 extends ColGroupDDC int nrow = getNumRows(); int ncol = getNumCols(); double[] c = target.getDenseBlock(); + int nnz = 0; for( int i = 0; i < nrow; i++ ) - c[i] = _values[_data[i]*ncol+colpos]; - target.recomputeNonZeros(); + nnz += ((c[i] = _values[_data[i]*ncol+colpos])!=0) ? 1 : 0; + target.setNonZeros(nnz); } @Override @@ -250,27 +251,49 @@ public class ColGroupDDC2 extends ColGroupDDC } //post-scaling of pre-aggregate with distinct values - for( int k=0, valOff=0; k<numVals; k++, valOff+=ncol ) { - double aval = vals[k]; - for( int j=0; j<ncol; j++ ) { - int colIx = _colIndexes[j]; - c[colIx] += aval * _values[valOff+j]; - } - } + postScaling(vals, c); } else //general case - { - + { //iterate over codes, compute all, and add to the result for( int i=0; i<nrow; i++ ) { double aval = a[i]; - if( aval != 0 ) { - int valOff = _data[i] * ncol; - for( int j=0; j<ncol; j++ ) { - int colIx = _colIndexes[j]; - c[colIx] += aval * _values[valOff+j]; - } - } + if( aval != 0 ) + for( int j=0, valOff=_data[i]*ncol; j<ncol; j++ ) + c[_colIndexes[j]] += aval * _values[valOff+j]; + } + } + } + + @Override + public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) + throws DMLRuntimeException + { + double[] c = result.getDenseBlock(); + final int nrow = getNumRows(); + final int ncol = getNumCols(); + final int numVals = getNumValues(); + + if( 8*numVals < getNumRows() ) + { + //iterative over codes and pre-aggregate inputs per code + //temporary array also avoids false sharing in multi-threaded environments + double[] vals = allocDVector(numVals, true); + for( int i=0; i<nrow; i++ ) { + vals[_data[i]] += a.getData(i, 0); + } + + //post-scaling of pre-aggregate with distinct values + postScaling(vals, c); + } + else //general case + { + //iterate over codes, compute all, and add to the result + for( int i=0; i<nrow; i++ ) { + double aval = a.getData(i, 0); + if( aval != 0 ) + for( int j=0, valOff=_data[i]*ncol; j<ncol; j++ ) + c[_colIndexes[j]] += aval * _values[valOff+j]; } } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java index fab8917..70f759e 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupOLE.java @@ -211,7 +211,9 @@ public class ColGroupOLE extends ColGroupOffset int[] apos = allocIVector(numVals, true); //cache conscious append via horizontal scans + int nnz = 0; for( int bi=0; bi<n; bi+=blksz ) { + Arrays.fill(c, bi, Math.min(bi+blksz, n), 0); for (int k = 0, off=0; k < numVals; k++, off+=numCols) { int boff = _ptr[k]; int blen = len(k); @@ -222,11 +224,12 @@ public class ColGroupOLE extends ColGroupOffset int pos = boff+bix+1; for( int i=pos; i<pos+len; i++ ) { c[bi+_data[i]] = _values[off+colpos]; + nnz++; } apos[k] += len + 1; } } - target.recomputeNonZeros(); + target.setNonZeros(nnz); } @Override @@ -446,6 +449,30 @@ public class ColGroupOLE extends ColGroupOffset } @Override + public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) + throws DMLRuntimeException + { + //note: this method is only applicable for numrows < blocksize + double[] c = result.getDenseBlock(); + final int numCols = getNumCols(); + final int numVals = getNumValues(); + + //iterate over all values and their bitmaps + for (int k=0, valOff=0; k<numVals; k++, valOff+=numCols) { + int boff = _ptr[k]; + + //iterate over bitmap blocks and add partial results + double vsum = 0; + for( int j = boff+1; j < boff+1+_data[boff]; j++ ) + vsum += a.getData(_data[j], 0); + + //scale partial results by values and write results + for( int j = 0; j < numCols; j++ ) + c[ _colIndexes[j] ] += vsum * _values[ valOff+j ]; + } + } + + @Override protected final void computeSum(MatrixBlock result, KahanFunction kplus) { KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1)); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java index cd658b0..6d1fb9f 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupRLE.java @@ -203,8 +203,10 @@ public class ColGroupRLE extends ColGroupOffset int[] apos = allocIVector(numVals, true); //cache conscious append via horizontal scans + int nnz = 0; for( int bi=0; bi<n; bi+=blksz ) { int bimax = Math.min(bi+blksz, n); + Arrays.fill(c, bi, bimax, 0); for (int k=0, off=0; k < numVals; k++, off+=numCols) { int boff = _ptr[k]; int blen = len(k); @@ -215,15 +217,15 @@ public class ColGroupRLE extends ColGroupOffset for( ; bix<blen & start<bimax; bix+=2) { start += _data[boff + bix]; int len = _data[boff + bix+1]; - for( int i=start; i<start+len; i++ ) - c[i] = _values[off+colpos]; + Arrays.fill(c, start, start+len, _values[off+colpos]); + nnz += len; start += len; } apos[k] = bix; astart[k] = start; } } - target.recomputeNonZeros(); + target.setNonZeros(nnz); } @Override @@ -422,6 +424,37 @@ public class ColGroupRLE extends ColGroupOffset } @Override + public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) + throws DMLRuntimeException + { + //note: this method is only applicable for numrows < blocksize + double[] c = result.getDenseBlock(); + final int numCols = getNumCols(); + final int numVals = getNumValues(); + + //iterate over all values and their bitmaps + for (int k=0, valOff=0; k<numVals; k++, valOff+=numCols) + { + int boff = _ptr[k]; + int blen = len(k); + + double vsum = 0; + int curRunEnd = 0; + for ( int bix = 0; bix < blen; bix+=2 ) { + int curRunStartOff = curRunEnd + _data[boff+bix]; + int curRunLen = _data[boff+bix+1]; + for( int i=curRunStartOff; i<curRunStartOff+curRunLen; i++ ) + vsum += a.getData(i, 0); + curRunEnd = curRunStartOff + curRunLen; + } + + //scale partial results by values and write results + for( int j = 0; j < numCols; j++ ) + c[ _colIndexes[j] ] += vsum * _values[ valOff+j ]; + } + } + + @Override public ColGroup scalarOperation(ScalarOperator op) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java b/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java index 8bfc019..79d963e 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java +++ b/src/main/java/org/apache/sysml/runtime/compress/ColGroupValue.java @@ -296,6 +296,11 @@ public abstract class ColGroupValue extends ColGroup for( int j=0; j<numCols; j++ ) result.quickSetValue(0, _colIndexes[j], vals[j]); } + + //additional vector-matrix multiplication to avoid DDC uncompression + public abstract void leftMultByRowVector(ColGroupDDC vector, MatrixBlock result) + throws DMLRuntimeException; + /** * Method for use by subclasses. Applies a scalar operation to the value http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java index 003f31a..ed2ab27 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/compress/CompressedMatrixBlock.java @@ -1222,8 +1222,8 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable leftMultByTransposeSelf(_colGroups, out, 0, _colGroups.size()); // post-processing - LinearAlgebraUtils.copyUpperToLowerTriangle(out); - out.recomputeNonZeros(); + out.setNonZeros(LinearAlgebraUtils + .copyUpperToLowerTriangle(out)); } if( LOG.isDebugEnabled() ) @@ -1280,8 +1280,8 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable } // post-processing - LinearAlgebraUtils.copyUpperToLowerTriangle(out); - out.recomputeNonZeros(); + out.setNonZeros(LinearAlgebraUtils + .copyUpperToLowerTriangle(out)); } if( LOG.isDebugEnabled() ) @@ -1434,6 +1434,21 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable result.recomputeNonZeros(); } + private static void leftMultByVectorTranspose(List<ColGroup> colGroups, ColGroupDDC vector, MatrixBlock result) + throws DMLRuntimeException + { + // initialize and allocate the result + result.reset(); + + // delegate matrix-vector operation to each column group + for( ColGroup grp : colGroups ) { + ((ColGroupValue)grp).leftMultByRowVector(vector, result); + } + + // post-processing + result.recomputeNonZeros(); + } + /** * Multi-thread version of leftMultByVectorTranspose. * @@ -1492,6 +1507,7 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable { final int numRows = groups.get(0).getNumRows(); final int numGroups = groups.size(); + final boolean containsUC = containsUncompressedColGroup(groups); //preallocated dense tmp matrix blocks MatrixBlock lhs = new MatrixBlock(1, numRows, false); @@ -1511,20 +1527,28 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable int[] ixgroup = group.getColIndices(); List<ColGroup> tmpList = groups.subList(i, numGroups); - //for all uncompressed lhs columns vectors - for( int j=0; j<ixgroup.length; j++ ) { - //decompress single column - if( !(group instanceof ColGroupDDC) ) - lhs.reset(1, numRows, false); - group.decompressToBlock(lhs, j); + if( group instanceof ColGroupDDC //single DDC group + && ixgroup.length==1 && !containsUC && numRows<BitmapEncoder.BITMAP_BLOCK_SZ ) + { + //compute vector-matrix partial result + leftMultByVectorTranspose(tmpList, (ColGroupDDC)group, tmpret); - if( !lhs.isEmptyBlock(false) ) { - //compute vector-matrix partial result - leftMultByVectorTranspose(tmpList, lhs, tmpret, false, false); + //write partial results (disjoint non-zeros) + LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[0]); + } + else { + //for all uncompressed lhs columns vectors + for( int j=0; j<ixgroup.length; j++ ) { + group.decompressToBlock(lhs, j); - //write partial results (disjoint non-zeros) - LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[j]); - } + if( !lhs.isEmptyBlock(false) ) { + //compute vector-matrix partial result + leftMultByVectorTranspose(tmpList, lhs, tmpret, false, false); + + //write partial results (disjoint non-zeros) + LinearAlgebraUtils.copyNonZerosToUpperTriangle(result, tmpret, ixgroup[j]); + } + } } } @@ -1574,6 +1598,13 @@ public class CompressedMatrixBlock extends MatrixBlock implements Externalizable return null; } + + private static boolean containsUncompressedColGroup(ArrayList<ColGroup> groups) { + for( ColGroup grp : groups ) + if( grp instanceof ColGroupUncompressed ) + return true; + return false; + } private static class LeftMatrixMultTask implements Callable<Object> { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/88030df8/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java index 515457b..de6e233 100644 --- a/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java +++ b/src/main/java/org/apache/sysml/runtime/compress/utils/LinearAlgebraUtils.java @@ -128,27 +128,20 @@ public class LinearAlgebraUtils final int bn = len%8; //rest, not aligned to 8-blocks - for( int j = 0; j < bn; j++, ai++ ) - val += a[ ai ]; + for( int j = ai; j < ai+bn; j++ ) + val += a[ j ]; //unrolled 8-block (for better instruction-level parallelism) - for( int j = bn; j < len; j+=8, ai+=8 ) - { - val += a[ ai+0 ] - + a[ ai+1 ] - + a[ ai+2 ] - + a[ ai+3 ] - + a[ ai+4 ] - + a[ ai+5 ] - + a[ ai+6 ] - + a[ ai+7 ]; + for( int j = ai+bn; j < ai+len; j+=8 ) { + val += a[ j+0 ] + a[ j+1 ] + a[ j+2 ] + a[ j+3 ] + + a[ j+4 ] + a[ j+5 ] + a[ j+6 ] + a[ j+7 ]; } return val; } - public static void copyUpperToLowerTriangle( MatrixBlock ret ) { - LibMatrixMult.copyUpperToLowerTriangle(ret); + public static long copyUpperToLowerTriangle( MatrixBlock ret ) { + return LibMatrixMult.copyUpperToLowerTriangle(ret); } public static void copyNonZerosToUpperTriangle( MatrixBlock ret, MatrixBlock tmp, int ix ) {