[SYSTEMML-1959] Extended shallow-serialize for sparse matrices Originally, all dense and sparse matrices were serialized into byte arrays on write into the buffer pool. Later, we introduced the concept of shallow-serialize where we simply hold a strong reference to all matrices whose in-memory size is equal or close to their size in serialized form, which greatly reduced the buffer pool overhead.
For tall and skinny sparse matrices in MCSR format (our default), this is not the case, which causes serialization overhead and potentially unnecessary evictions if the serialized representation is larger than 2GB (max integer size of byte arrays). This patch overcomes these issues, by deciding upon a potential conversion from MCSR to CSR, which allows for shallow serialize. Since this only happens in cases where we would have serialized or evicted the MCSR block, this conversion is safe with regard to potentially unnecessary overhead. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/db725dec Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/db725dec Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/db725dec Branch: refs/heads/master Commit: db725dec447be39b4e2f16ec66d669ad48217874 Parents: d641c22 Author: Matthias Boehm <[email protected]> Authored: Fri Oct 13 22:44:09 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Sat Oct 14 02:30:37 2017 -0700 ---------------------------------------------------------------------- .../controlprogram/caching/ByteBuffer.java | 17 +++--- .../controlprogram/caching/CacheBlock.java | 20 +++++++ .../sysml/runtime/matrix/data/FrameBlock.java | 11 +++- .../sysml/runtime/matrix/data/MatrixBlock.java | 56 ++++++++++++++------ 4 files changed, 81 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java index 807fdc4..a87e4b4 100644 --- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/ByteBuffer.java @@ -52,7 +52,7 @@ public class ByteBuffer public void serializeBlock( CacheBlock cb ) throws IOException { - _shallow = cb.isShallowSerialize(); + _shallow = cb.isShallowSerialize(true); _matrix = (cb instanceof MatrixBlock); try @@ -69,6 +69,10 @@ public class ByteBuffer } else //SPARSE/DENSE -> DENSE { + //convert to shallow serialize block if necessary + if( !cb.isShallowSerialize() ) + cb.toShallowSerializeBlock(); + //shallow serialize _cdata = cb; } @@ -160,14 +164,15 @@ public class ByteBuffer */ public static boolean isValidCapacity( long size, CacheBlock cb ) { - if( !cb.isShallowSerialize() ) { //SPARSE matrix blocks + if( !cb.isShallowSerialize(true) ) { //SPARSE matrix blocks // since cache blocks are serialized into a byte representation // the buffer buffer can hold at most 2GB in size - return ( size <= Integer.MAX_VALUE ); + return ( size <= Integer.MAX_VALUE ); } - else {//DENSE matrix / frame blocks - // since for dense matrix blocks we use a shallow serialize (strong reference), - // the byte buffer can hold any size (currently upper bounded by 16GB) + else {//DENSE/SPARSE matrix / frame blocks + // for dense and under special conditions also sparse matrix blocks + // we use a shallow serialize (strong reference), there is no additional + // capacity constraint for serializing these blocks into byte arrays return true; } } http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java index eb34b09..bd03ead 100644 --- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheBlock.java @@ -59,6 +59,26 @@ public interface CacheBlock extends Writable public boolean isShallowSerialize(); /** + * Indicates if the cache block is subject to shallow serialized, + * which is generally true if in-memory size and serialized size + * are almost identical allowing to avoid unnecessary deep serialize. + * + * @param inclConvert if true report blocks as shallow serialize that are + * currently not amenable but can be brought into an amenable form + * via {@link #toShallowSerializeBlock() toShallowSerializeBlock}. + * + * @return true if shallow serialized + */ + public boolean isShallowSerialize(boolean inclConvert); + + /** + * Converts a cache block that is not shallow serializable into + * a form that is shallow serializable. This methods has no affect + * if the given cache block is not amenable. + */ + public void toShallowSerializeBlock(); + + /** * Free unnecessarily allocated empty block. 
*/ public void compactEmptyBlock(); http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index a79f6c2..a56fd6a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -753,15 +753,24 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable @Override public boolean isShallowSerialize() { + return isShallowSerialize(false); + } + + @Override + public boolean isShallowSerialize(boolean inclConvert) { //shallow serialize if non-string schema because a frame block //is always dense but strings have large array overhead per cell boolean ret = true; for( int j=0; j<_schema.length && ret; j++ ) ret &= (_schema[j] != ValueType.STRING); - return ret; } + @Override + public void toShallowSerializeBlock() { + //do nothing (not applicable). 
+ } + @Override public void compactEmptyBlock() { //do nothing http://git-wip-us.apache.org/repos/asf/systemml/blob/db725dec/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index 8117c04..e563c60 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -105,6 +105,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab public static final SparseBlock.Type DEFAULT_INPLACE_SPARSEBLOCK = SparseBlock.Type.CSR; //allowed overhead for shallow serialize in terms of in-memory-size/x <= serialized-size public static final double MAX_SHALLOW_SERIALIZE_OVERHEAD = 1.3; + //flag if MCSR blocks that do not qualify for shallow serialize should be converted to CSR + public static final boolean CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE = true; //basic header (int rlen, int clen, byte type) public static final int HEADER_SIZE = 9; @@ -947,6 +949,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab isPM &= sblock.isEmpty(i) || sblock.size(i) == 1; return isPM; } + + private boolean isUltraSparseSerialize(boolean sparseDst) { + return nonZeros<rlen && sparseDst; + } /** * Evaluates if this matrix block should be in sparse format in @@ -1004,7 +1010,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //ensure exact size estimates for write if( nonZeros <= 0 ) { recomputeNonZeros(); - } + } //decide on in-memory representation return evalSparseFormatOnDisk(lrlen, lclen, nonZeros); @@ -1674,8 +1680,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab if( !a.isEmpty(i) ) { boolean update = a.set(i, cl, 0); if( updateNNZ ) - 
nonZeros -= update ? 1 : 0; - } + nonZeros -= update ? 1 : 0; + } } else { @@ -1685,7 +1691,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab a.deleteIndexRange(i, cl, cu+1); if( updateNNZ ) nonZeros += (a.size(i)-lnnz); - } + } } } @@ -2028,7 +2034,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab for(long i=0; i<nonZeros; i++) { int r = in.readInt(); int c = in.readInt(); - double val = in.readDouble(); + double val = in.readDouble(); denseBlock[r*clen+c] = val; } } @@ -2037,7 +2043,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //col: read iv-pairs for(long i=0; i<nonZeros; i++) { int r = in.readInt(); - double val = in.readDouble(); + double val = in.readDouble(); denseBlock[r] = val; } } @@ -2060,7 +2066,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //write sparse to * if( sparseBlock==null || nonZeros==0 ) writeEmptyBlock(out); - else if( nonZeros<rlen && sparseDst ) + else if( isUltraSparseSerialize(sparseDst) ) writeSparseToUltraSparse(out); else if( sparseDst ) writeSparseBlock(out); @@ -2072,7 +2078,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //write dense to * if( denseBlock==null || nonZeros==0 ) writeEmptyBlock(out); - else if( nonZeros<rlen && sparseDst ) + else if( isUltraSparseSerialize(sparseDst) ) writeDenseToUltraSparse(out); else if( sparseDst ) writeDenseToSparse(out); @@ -2127,8 +2133,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab for(int j=pos; j<pos+nr; j++) { out.writeInt(cols[j]); out.writeDouble(values[j]); - } - } + } + } } for(;r<rlen; r++) out.writeInt(0); @@ -2206,14 +2212,14 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab for( ; j<aix[apos+j2]; j++ ) out.writeDouble( 0 ); out.writeDouble( avals[apos+j2] ); - } + } //remaining 0 values in row for( int 
j=aix[apos+alen-1]+1; j<clen; j++) out.writeDouble( 0 ); } else //empty row for( int j=0; j<clen; j++ ) - out.writeDouble( 0 ); + out.writeDouble( 0 ); } } } @@ -2361,7 +2367,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab } else { //default serialize (general case) - write(os); + write(os); } } @@ -2621,12 +2627,30 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab @Override public boolean isShallowSerialize() { + return isShallowSerialize(false); + } + + @Override + public boolean isShallowSerialize(boolean inclConvert) { //shallow serialize if dense, dense in serialized form or already in CSR - return !sparse || !evalSparseFormatOnDisk() + boolean sparseDst = evalSparseFormatOnDisk(); + return !sparse || !sparseDst || (sparse && sparseBlock instanceof SparseBlockCSR) || (sparse && sparseBlock instanceof SparseBlockMCSR - && getInMemorySize()/MAX_SHALLOW_SERIALIZE_OVERHEAD - <= getExactSerializedSize()); + && getInMemorySize() / MAX_SHALLOW_SERIALIZE_OVERHEAD + <= getExactSerializedSize()) + || (sparse && sparseBlock instanceof SparseBlockMCSR + && nonZeros < Integer.MAX_VALUE //CSR constraint + && inclConvert && CONVERT_MCSR_TO_CSR_ON_DEEP_SERIALIZE + && !isUltraSparseSerialize(sparseDst)); + } + + @Override + public void toShallowSerializeBlock() { + if( isShallowSerialize() || !isShallowSerialize(true) ) + return; + sparseBlock = SparseBlockFactory.copySparseBlock( + SparseBlock.Type.CSR, sparseBlock, false); } @Override
