[MINOR] Performance frame transformencode (selective row iterators) This patch adds selective row iterators to frame blocks. These allow the transform recode encoder to iterate over only the selected columns of each row, which avoids unnecessary string conversions for unused columns.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4a6165b7 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4a6165b7 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4a6165b7 Branch: refs/heads/master Commit: 4a6165b796590a6388a9c182612761219731d77f Parents: f485ab2 Author: Matthias Boehm <[email protected]> Authored: Fri Jul 7 17:14:39 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Sat Jul 8 22:32:03 2017 -0700 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/FrameBlock.java | 82 +++++++++++++++++--- .../transform/decode/DecoderFactory.java | 2 +- .../transform/encode/EncoderFactory.java | 2 +- .../runtime/transform/encode/EncoderRecode.java | 4 +- .../sysml/runtime/util/UtilFunctions.java | 46 +++++++---- 5 files changed, 107 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index 99a6f3f..512b85c 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -505,6 +505,17 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } /** + * Get a row iterator over the frame where all selected fields are + * encoded as strings independent of their value types. 
+ * + * @param cols column selection, 1-based + * @return string array iterator + */ + public Iterator<String[]> getStringRowIterator(int[] cols) { + return new StringRowIterator(0, _numRows, cols); + } + + /** * Get a row iterator over the frame where all fields are encoded * as strings independent of their value types. * @@ -517,6 +528,19 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } /** + * Get a row iterator over the frame where all selected fields are + * encoded as strings independent of their value types. + * + * @param rl lower row index + * @param ru upper row index + * @param cols column selection, 1-based + * @return string array iterator + */ + public Iterator<String[]> getStringRowIterator(int rl, int ru, int[] cols) { + return new StringRowIterator(rl, ru, cols); + } + + /** * Get a row iterator over the frame where all fields are encoded * as boxed objects according to their value types. * @@ -527,6 +551,17 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } /** + * Get a row iterator over the frame where all selected fields are + * encoded as boxed objects according to their value types. + * + * @param cols column selection, 1-based + * @return object array iterator + */ + public Iterator<Object[]> getObjectRowIterator(int[] cols) { + return new ObjectRowIterator(0, _numRows, cols); + } + + /** * Get a row iterator over the frame where all fields are encoded * as boxed objects according to their value types. * @@ -537,6 +572,19 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable public Iterator<Object[]> getObjectRowIterator(int rl, int ru) { return new ObjectRowIterator(rl, ru); } + + /** + * Get a row iterator over the frame where all selected fields are + * encoded as boxed objects according to their value types. 
+ * + * @param rl lower row index + * @param ru upper row index + * @param cols column selection, 1-based + * @return object array iterator + */ + public Iterator<Object[]> getObjectRowIterator(int rl, int ru, int[] cols) { + return new ObjectRowIterator(rl, ru, cols); + } /////// // serialization / deserialization (implementation of writable and externalizable) @@ -1111,14 +1159,20 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable // row iterators (over strings and boxed objects) private abstract class RowIterator<T> implements Iterator<T[]> { - protected T[] _curRow = null; + protected final int[] _cols; + protected final T[] _curRow; + protected final int _maxPos; protected int _curPos = -1; - protected int _maxPos = -1; protected RowIterator(int rl, int ru) { - _curPos = rl; + this(rl, ru, UtilFunctions.getSeqArray(1, getNumColumns(), 1)); + } + + protected RowIterator(int rl, int ru, int[] cols) { + _curRow = createRow(cols.length); + _cols = cols; _maxPos = ru; - _curRow = createRow(getNumColumns()); + _curPos = rl; } @Override @@ -1139,6 +1193,10 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable super(rl, ru); } + public StringRowIterator(int rl, int ru, int[] cols) { + super(rl, ru, cols); + } + @Override protected String[] createRow(int size) { return new String[size]; @@ -1146,11 +1204,11 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable @Override public String[] next( ) { - for( int j=0; j<getNumColumns(); j++ ) { - Object tmp = get(_curPos, j); + for( int j=0; j<_cols.length; j++ ) { + Object tmp = get(_curPos, _cols[j]-1); _curRow[j] = (tmp!=null) ? 
tmp.toString() : null; } - _curPos++; + _curPos++; return _curRow; } } @@ -1160,6 +1218,10 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable super(rl, ru); } + public ObjectRowIterator(int rl, int ru, int[] cols) { + super(rl, ru, cols); + } + @Override protected Object[] createRow(int size) { return new Object[size]; @@ -1167,9 +1229,9 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable @Override public Object[] next( ) { - for( int j=0; j<getNumColumns(); j++ ) - _curRow[j] = get(_curPos, j); - _curPos++; + for( int j=0; j<_cols.length; j++ ) + _curRow[j] = get(_curPos, _cols[j]-1); + _curPos++; return _curRow; } } http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java index 425466a..c02609a 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/decode/DecoderFactory.java @@ -56,7 +56,7 @@ public class DecoderFactory TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE))); rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs)); List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils - .subtract(UtilFunctions.getSequenceList(1, meta.getNumColumns(), 1), rcIDs)); + .subtract(UtilFunctions.getSeqList(1, meta.getNumColumns(), 1), rcIDs)); //create default schema if unspecified (with double columns for pass-through) if( schema == null ) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java index 13b2810..5e0a178 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderFactory.java @@ -65,7 +65,7 @@ public class EncoderFactory rcIDs = new ArrayList<Integer>(CollectionUtils.union(rcIDs, dcIDs)); List<Integer> binIDs = TfMetaUtils.parseBinningColIDs(jSpec, colnames); List<Integer> ptIDs = new ArrayList<Integer>(CollectionUtils.subtract( - CollectionUtils.subtract(UtilFunctions.getSequenceList(1, clen, 1), rcIDs), binIDs)); + CollectionUtils.subtract(UtilFunctions.getSeqList(1, clen, 1), rcIDs), binIDs)); List<Integer> oIDs = Arrays.asList(ArrayUtils.toObject( TfMetaUtils.parseJsonIDList(jSpec, colnames, TfUtils.TXMETHOD_OMIT))); List<Integer> mvIDs = Arrays.asList(ArrayUtils.toObject( http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java index bb8592c..dc75a74 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java @@ -112,7 +112,7 @@ public class EncoderRecode extends Encoder if( !isApplicable() ) return; - Iterator<String[]> iter = in.getStringRowIterator(); + Iterator<String[]> iter = in.getStringRowIterator(_colList); while( iter.hasNext() ) { String[] row = iter.next(); for( int j=0; j<_colList.length; j++ ) { @@ -122,7 +122,7 @@ public class EncoderRecode extends Encoder _rcdMaps.put(colID, new HashMap<String,Long>()); //probe and build column map HashMap<String,Long> map 
= _rcdMaps.get(colID); - String key = row[colID-1]; + String key = row[j]; if( key!=null && !key.isEmpty() && !map.containsKey(key) ) map.put(key, Long.valueOf(map.size()+1)); } http://git-wip-us.apache.org/repos/asf/systemml/blob/4a6165b7/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java index f76d37b..8c4cacd 100644 --- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java @@ -314,6 +314,37 @@ public class UtilFunctions return 1L + (long) Math.floor(to/incr - from/incr); } + /** + * Obtain sequence list + * + * @param low lower bound (inclusive) + * @param up upper bound (inclusive) + * @param incr increment + * @return list of integers + */ + public static List<Integer> getSeqList(int low, int up, int incr) { + ArrayList<Integer> ret = new ArrayList<Integer>(); + for( int i=low; i<=up; i+=incr ) + ret.add(i); + return ret; + } + + /** + * Obtain sequence array + * + * @param low lower bound (inclusive) + * @param up upper bound (inclusive) + * @param incr increment + * @return array of integers + */ + public static int[] getSeqArray(int low, int up, int incr) { + int len = (int) getSeqLength(low, up, incr); + int[] ret = new int[len]; + for( int i=0, val=low; i<len; i++, val+=incr ) + ret[i] = val; + return ret; + } + public static int roundToNext(int val, int factor) { //round up to next non-zero multiple of factor int pval = Math.max(val, factor); @@ -506,21 +537,6 @@ public class UtilFunctions else return String.format("%d", arg); } - - /** - * Obtain sequence list - * - * @param low lower bound (inclusive) - * @param up upper bound (inclusive) - * @param incr increment - * @return list of integers - */ - public static List<Integer> getSequenceList(int low, 
int up, int incr) { - ArrayList<Integer> ret = new ArrayList<Integer>(); - for( int i=low; i<=up; i+=incr ) - ret.add(i); - return ret; - } public static double getDouble(Object obj) { return (obj instanceof Double) ? (Double)obj :
