Repository: incubator-systemml
Updated Branches:
  refs/heads/master 51c4291ab -> 172bfcacc
[SYSTEMML-573] Fix meta data handling of textcell frame readers/writers

Including fixes for meta data handling on frame block merge, slice, and deep copy.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/12f2da9f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/12f2da9f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/12f2da9f

Branch: refs/heads/master
Commit: 12f2da9f007ea97db566bc12ed95e13c590bc8c2
Parents: 51c4291
Author: Matthias Boehm <[email protected]>
Authored: Thu Jul 7 01:04:18 2016 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Fri Jul 8 10:59:13 2016 -0700

----------------------------------------------------------------------
 .../functions/ConvertFrameBlockToIJVLines.java | 9 ++++++
 .../sysml/runtime/io/FrameReaderTextCell.java | 17 +++++++---
 .../sysml/runtime/io/FrameWriterTextCell.java | 11 +++++++
 .../sysml/runtime/matrix/data/FrameBlock.java | 28 ++++++++++------
 .../matrix/mapred/FrameReblockBuffer.java | 13 +++++---
 .../functions/frame/FrameMetaReadWriteTest.java | 34 +++++++++-----------
 6 files changed, 76 insertions(+), 36 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/main/java/org/apache/sysml/runtime/instructions/spark/functions/ConvertFrameBlockToIJVLines.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/functions/ConvertFrameBlockToIJVLines.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/functions/ConvertFrameBlockToIJVLines.java
index 3c50668..792e0b6 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/functions/ConvertFrameBlockToIJVLines.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/spark/functions/ConvertFrameBlockToIJVLines.java @@ -40,6 +40,15 @@ public class ConvertFrameBlockToIJVLines implements FlatMapFunction<Tuple2<Long, ArrayList<String> cells = new ArrayList<String>(); + //write frame meta data + if( rowoffset == 1 ) { + for( int j=0; j<block.getNumColumns(); j++ ) + if( !block.isColumnMetadataDefault(j) ) { + cells.add("-1 " + (j+1) + " " + block.getColumnMetadata(j).getNumDistinct()); + cells.add("-2 " + (j+1) + " " + block.getColumnMetadata(j).getMvValue()); + } + } + //convert frame block to list of ijv cell triples StringBuilder sb = new StringBuilder(); Iterator<String[]> iter = block.getStringRowIterator(); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java b/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java index 8345b47..e7efc2b 100644 --- a/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java +++ b/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java @@ -182,7 +182,12 @@ public class FrameReaderTextCell extends FrameReader st.reset( value.toString() ); //reinit tokenizer row = st.nextInt()-1; col = st.nextInt()-1; - dest.set(row, col, UtilFunctions.stringToObject(schema.get(col), st.nextToken())); + if( row == -3 ) + dest.getColumnMetadata(col).setMvValue(st.nextToken()); + else if( row == -2 ) + dest.getColumnMetadata(col).setNumDistinct(st.nextLong()); + else + dest.set(row, col, UtilFunctions.stringToObject(schema.get(col), st.nextToken())); } } catch(Exception ex) @@ -253,8 +258,13 @@ public class FrameReaderTextCell extends FrameReader while( (value=br.readLine())!=null ) { st.reset( value ); //reinit tokenizer row = st.nextInt()-1; - col = st.nextInt()-1; - dest.set(row, col, 
UtilFunctions.stringToObject(schema.get(col), st.nextToken())); + col = st.nextInt()-1; + if( row == -3 ) + dest.getColumnMetadata(col).setMvValue(st.nextToken()); + else if (row == -2) + dest.getColumnMetadata(col).setNumDistinct(st.nextLong()); + else + dest.set(row, col, UtilFunctions.stringToObject(schema.get(col), st.nextToken())); } } catch(Exception ex) @@ -272,5 +282,4 @@ public class FrameReaderTextCell extends FrameReader IOUtilFunctions.closeSilently(br); } } - } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/main/java/org/apache/sysml/runtime/io/FrameWriterTextCell.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/io/FrameWriterTextCell.java b/src/main/java/org/apache/sysml/runtime/io/FrameWriterTextCell.java index 38348ad..d257b69 100644 --- a/src/main/java/org/apache/sysml/runtime/io/FrameWriterTextCell.java +++ b/src/main/java/org/apache/sysml/runtime/io/FrameWriterTextCell.java @@ -110,6 +110,17 @@ public class FrameWriterTextCell extends FrameWriter //for obj reuse and preventing repeated buffer re-allocations StringBuilder sb = new StringBuilder(); + //write frame meta data + if( rl == 0 ) { + for( int j=0; j<cols; j++ ) + if( !src.isColumnMetadataDefault(j) ) { + sb.append("-1 " + (j+1) + " " + src.getColumnMetadata(j).getNumDistinct() + "\n"); + sb.append("-2 " + (j+1) + " " + src.getColumnMetadata(j).getMvValue() + "\n"); + br.write( sb.toString() ); + sb.setLength(0); + } + } + //write frame row range to output Iterator<String[]> iter = src.getStringRowIterator(rl, ru); for( int i=rl; iter.hasNext(); i++ ) { //for all rows http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index d5787ca..2088e85 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -83,8 +83,9 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } public FrameBlock(FrameBlock that) { - this(that.getSchema()); + this(that.getSchema(), that.getColumnNames()); copy(that); + setColumnMetadata(that.getColumnMetadata()); } public FrameBlock(int ncols, ValueType vt) { @@ -206,7 +207,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable * @param colmeta */ public void setColumnMetadata(List<ColumnMetadata> colmeta) { - _colmeta = colmeta; + _colmeta = new ArrayList<FrameBlock.ColumnMetadata>(colmeta); } /** @@ -330,7 +331,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable getColumnNames().clear(); if( _colmeta != null ) { for( int i=0; i<_colmeta.size(); i++ ) - _colmeta.get(i).reset(); + if( !isColumnMetadataDefault(i) ) + _colmeta.set(i, new ColumnMetadata(0)); } if(_coldata != null) { for( int i=0; i < _coldata.size(); i++ ) @@ -888,6 +890,13 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable if ( getNumRows() != that.getNumRows() || getNumColumns() != that.getNumColumns() ) throw new DMLRuntimeException("Dimension mismatch on merge disjoint (target="+getNumRows()+"x"+getNumColumns()+", source="+that.getNumRows()+"x"+that.getNumColumns()+")"); + //meta data copy if necessary + for( int j=0; j<getNumColumns(); j++ ) + if( !that.isColumnMetadataDefault(j) ) { + _colmeta.get(j).setNumDistinct(that._colmeta.get(j).getNumDistinct()); + _colmeta.get(j).setMvValue(that._colmeta.get(j).getMvValue()); + } + //core frame block merge through cell copy for( int i=0; i<that.getNumRows(); i++ ) { for( int j=0; j<getNumColumns(); j++ ) { @@ -896,7 +905,6 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable set(i, j,obj); } } - } /** @@ -1252,16 +1260,16 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable private long _ndistinct = 0; private String _mvValue = null; - public ColumnMetadata(long ndistinct, String mvval) { + public ColumnMetadata(long ndistinct) { _ndistinct = ndistinct; - _mvValue = mvval; } - public ColumnMetadata(long ndistinct) { + public ColumnMetadata(long ndistinct, String mvval) { _ndistinct = ndistinct; + _mvValue = mvval; } - public void reset() { - _ndistinct = 0; - _mvValue = null; + public ColumnMetadata(ColumnMetadata that) { + _ndistinct = that._ndistinct; + _mvValue = that._mvValue; } public long getNumDistinct() { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/main/java/org/apache/sysml/runtime/matrix/mapred/FrameReblockBuffer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/mapred/FrameReblockBuffer.java b/src/main/java/org/apache/sysml/runtime/matrix/mapred/FrameReblockBuffer.java index 2762a0d..9fd9e36 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/mapred/FrameReblockBuffer.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/mapred/FrameReblockBuffer.java @@ -218,7 +218,8 @@ public class FrameReblockBuffer long cbi = -1, cbj = -1; //current block indexes for( int i=0; i<_count; i++ ) { - long bi = UtilFunctions.computeBlockIndex(_buff[i].getRow(), _brlen); + //compute block indexes (w/ robustness for meta data handling) + long bi = Math.max(UtilFunctions.computeBlockIndex(_buff[i].getRow(), _brlen), 1); long bj = UtilFunctions.computeBlockIndex(_buff[i].getCol(), _bclen); //output block and switch to next index pair @@ -229,13 +230,17 @@ public class FrameReblockBuffer cbj = bj; tmpIx = (bi-1)*_brlen+1; tmpBlock = new FrameBlock(_schema); - tmpBlock.ensureAllocatedColumns(Math.min(_brlen, (int)(_rlen-(bi-1)*_brlen))); - + 
tmpBlock.ensureAllocatedColumns(Math.min(_brlen, (int)(_rlen-(bi-1)*_brlen))); } int ci = UtilFunctions.computeCellInBlock(_buff[i].getRow(), _brlen); int cj = UtilFunctions.computeCellInBlock(_buff[i].getCol(), _bclen); - tmpBlock.set(ci, cj, _buff[i].getObjVal()); + if( ci == -3 ) + tmpBlock.getColumnMetadata(cj).setMvValue(_buff[i].getObjVal().toString()); + else if( ci == -2 ) + tmpBlock.getColumnMetadata(cj).setNumDistinct(Long.parseLong(_buff[i].getObjVal().toString())); + else + tmpBlock.set(ci, cj, _buff[i].getObjVal()); } //output last block http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/12f2da9f/src/test/java/org/apache/sysml/test/integration/functions/frame/FrameMetaReadWriteTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/frame/FrameMetaReadWriteTest.java b/src/test/java/org/apache/sysml/test/integration/functions/frame/FrameMetaReadWriteTest.java index c71eac6..3803015 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/frame/FrameMetaReadWriteTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/frame/FrameMetaReadWriteTest.java @@ -55,33 +55,31 @@ public class FrameMetaReadWriteTest extends AutomatedTestBase runFrameReadWriteTest(OutputInfo.BinaryBlockOutputInfo, ExecType.CP); } + @Test + public void testFrameBinarySpark() { + runFrameReadWriteTest(OutputInfo.BinaryBlockOutputInfo, ExecType.SPARK); + } + + @Test + public void testFrameTextcellCP() { + runFrameReadWriteTest(OutputInfo.TextCellOutputInfo, ExecType.CP); + } + + @Test + public void testFrameTextcellSpark() { + runFrameReadWriteTest(OutputInfo.TextCellOutputInfo, ExecType.SPARK); + } + // TODO: add meta data support for text formats (requires consolidation with file-based transform first) // @Test // public void testFrameCsvCP() { // runFrameReadWriteTest(OutputInfo.CSVOutputInfo, ExecType.CP); // } -// -// @Test -// public void 
testFrameTextcellCP() { -// runFrameReadWriteTest(OutputInfo.TextCellOutputInfo, ExecType.CP); -// } - - @Test - public void testFrameBinarySpark() { - runFrameReadWriteTest(OutputInfo.BinaryBlockOutputInfo, ExecType.SPARK); - } - -// TODO: add meta data support for text formats (requires consolidation with file-based transform first) +// // @Test // public void testFrameCsvSpark() { // runFrameReadWriteTest(OutputInfo.CSVOutputInfo, ExecType.SPARK); // } -// -// @Test -// public void testFrameTextcellSpark() { -// runFrameReadWriteTest(OutputInfo.TextCellOutputInfo, ExecType.SPARK); -// } - /** *
