[SYSTEMML-1791] Performance of frame block indexing and transformapply. This patch makes the following performance improvements to various frame operations in order to remove unnecessary overheads:
(1) Shallow column copy on full column indexing. (2) Bidirectional reuse of recode maps across original meta data frame blocks and shallow column copies (e.g., after column indexing). (3) Avoid unnecessary long-string-double conversions on transformapply (the recently removed file-based transform required string lookups - we now avoid this long-string conversion which is unnecessary for the related frame operations). Furthermore, this patch also cleans up a couple of methods that became obsolete after the removal of the old file-based transform. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4a24b9a7 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4a24b9a7 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4a24b9a7 Branch: refs/heads/master Commit: 4a24b9a78424dc85fe774e6d2dd5689fea9cd5b1 Parents: 7cd978d Author: Matthias Boehm <[email protected]> Authored: Wed Jul 19 23:16:44 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Thu Jul 20 01:24:24 2017 -0700 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/FrameBlock.java | 45 ++++++++++---------- .../sysml/runtime/transform/encode/Encoder.java | 10 ----- .../runtime/transform/encode/EncoderBin.java | 31 -------------- .../transform/encode/EncoderComposite.java | 7 --- .../transform/encode/EncoderDummycode.java | 38 ----------------- .../transform/encode/EncoderMVImpute.java | 30 ------------- .../runtime/transform/encode/EncoderOmit.java | 5 --- .../transform/encode/EncoderPassThrough.java | 5 --- .../runtime/transform/encode/EncoderRecode.java | 45 +++----------------- 9 files changed, 29 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java index bfe236e..5e6404b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java @@ -67,13 +67,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable /** The data frame data as an ordered list of columns */ private Array[] _coldata = null; - /** Cache for recode maps from frame meta data, indexed by column 0-based */ - private Map<Integer, SoftReference<HashMap<String,Long>>> _rcdMapCache = null; - public FrameBlock() { _numRows = 0; - if( REUSE_RECODE_MAPS ) - _rcdMapCache = new HashMap<Integer, SoftReference<HashMap<String,Long>>>(); } /** @@ -120,8 +115,6 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable _colmeta[j] = new ColumnMetadata(0); for( int i=0; i<data.length; i++ ) appendRow(data[i]); - if( REUSE_RECODE_MAPS ) - _rcdMapCache = new HashMap<Integer, SoftReference<HashMap<String,Long>>>(); } /** @@ -872,16 +865,25 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable ret._colnames[j-cl] = getColumnName(j); } ret._numRows = ru-rl+1; - - //copy output data - if(ret._coldata == null ) { + if(ret._coldata == null ) ret._coldata = new Array[numCols]; + + //fast-path: shallow copy column indexing + if( ret._numRows == _numRows ) { + //this shallow copy does not only avoid an array copy, but + //also allows for bi-directional reuses of recodemaps for( int j=cl; j<=cu; j++ ) - ret._coldata[j-cl] = _coldata[j].slice(rl,ru); + ret._coldata[j-cl] = _coldata[j]; + } + //copy output data + else { + for( int j=cl; j<=cu; j++ ) { + if( ret._coldata[j-cl] == null ) + ret._coldata[j-cl] = _coldata[j].slice(rl,ru); + else + ret._coldata[j-cl].set(0, ru-rl, _coldata[j], rl); + } } - else - for( int 
j=cl; j<=cu; j++ ) - ret._coldata[j-cl].set(0, ru-rl, _coldata[j], rl); return ret; } @@ -1023,7 +1025,7 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable public HashMap<String,Long> getRecodeMap(int col) { //probe cache for existing map if( REUSE_RECODE_MAPS ) { - SoftReference<HashMap<String,Long>> tmp = _rcdMapCache.get(col); + SoftReference<HashMap<String,Long>> tmp = _coldata[col]._rcdMapCache; HashMap<String,Long> map = (tmp!=null) ? tmp.get() : null; if( map != null ) return map; } @@ -1034,10 +1036,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable for( int i=0; i<getNumRows(); i++ ) { Object val = ldata.get(i); if( val != null ) { -// String[] tmp = IOUtilFunctions.splitCSV( -// val.toString(), Lop.DATATYPE_PREFIX); - - // Instead of using splitCSV which is forcing string with RFC-4180 format, using Lop.DATATYPE_PREFIX separator to split token and code + // Instead of using splitCSV which is forcing string with RFC-4180 format, + // using Lop.DATATYPE_PREFIX separator to split token and code String[] tmp = new String[2]; int pos = val.toString().lastIndexOf(Lop.DATATYPE_PREFIX); tmp[0] = val.toString().substring(0, pos); @@ -1047,9 +1047,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable } //put created map into cache - if( REUSE_RECODE_MAPS ) { - _rcdMapCache.put(col, new SoftReference<HashMap<String,Long>>(map)); - } + if( REUSE_RECODE_MAPS ) + _coldata[col]._rcdMapCache = new SoftReference<>(map); return map; } @@ -1245,6 +1244,8 @@ public class FrameBlock implements Writable, CacheBlock, Externalizable * in order to avoid unnecessary dependencies. 
*/ private abstract static class Array<T> implements Writable { + protected SoftReference<HashMap<String,Long>> _rcdMapCache = null; + protected int _size = 0; protected int newSize() { return (int) Math.max(_size*2, 4); http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java index 304dcdb..e4af8a6 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java @@ -117,16 +117,6 @@ public abstract class Encoder implements Serializable * @return output matrix block */ public abstract MatrixBlock apply(FrameBlock in, MatrixBlock out); - - /** - * Encode input data according to existing transform meta - * data (transform apply). - * TODO remove once file-based transform removed - * - * @param in input data as string array - * @return encoded data as string array - */ - public abstract String[] apply(String[] in); /** * Construct a frame block out of the transform meta data. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java index fbe6994..e70a392 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java @@ -44,7 +44,6 @@ public class EncoderBin extends Encoder private int[] _numBins = null; private double[] _min=null, _max=null; // min and max among non-missing values - private double[] _binWidths = null; // width of a bin for each attribute //frame transform-apply attributes private double[][] _binMins = null; @@ -83,8 +82,6 @@ public class EncoderBin extends Encoder Arrays.fill(_min, Double.MAX_VALUE); _max = new double[_colList.length]; Arrays.fill(_max, -Double.MAX_VALUE); - - _binWidths = new double[_colList.length]; } } @@ -121,34 +118,6 @@ public class EncoderBin extends Encoder // nothing to do } - /** - * Method to apply transformations. - */ - @Override - public String[] apply(String[] words) { - if( !isApplicable() ) - return words; - - for(int i=0; i < _colList.length; i++) { - int colID = _colList[i]; - try { - double val = UtilFunctions.parseToDouble(words[colID-1]); - int binid = 1; - double tmp = _min[i] + _binWidths[i]; - while(val > tmp && binid < _numBins[i]) { - tmp += _binWidths[i]; - binid++; - } - words[colID-1] = Integer.toString(binid); - } - catch(NumberFormatException e) { - throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. 
Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method."); - } - } - - return words; - } - @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { for(int j=0; j<_colList.length; j++) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java index deff887..ffff1df 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java @@ -80,13 +80,6 @@ public class EncoderComposite extends Encoder for( Encoder encoder : _encoders ) encoder.build(in); } - - @Override - public String[] apply(String[] in) { - for( Encoder encoder : _encoders ) - encoder.apply(in); - return in; - } @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java index 743381a..9a2f059 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java @@ -58,44 +58,6 @@ public class EncoderDummycode extends Encoder //do nothing } - /** - * Method to apply transformations. 
- * - * @param words array of strings - * @return array of transformed strings - */ - @Override - public String[] apply(String[] words) - { - if( !isApplicable() ) - return words; - - String[] nwords = new String[(int)_dummycodedLength]; - int rcdVal = 0; - - for(int colID=1, idx=0, ncolID=1; colID <= words.length; colID++) { - if(idx < _colList.length && colID==_colList[idx]) { - // dummycoded columns - try { - rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID-1])); - nwords[ ncolID-1+rcdVal-1 ] = "1"; - ncolID += _domainSizes[idx]; - idx++; - } - catch (Exception e) { - throw new RuntimeException("Error in dummycoding: colID="+colID + ", rcdVal=" + rcdVal+", word="+words[colID-1] - + ", domainSize=" + _domainSizes[idx] + ", dummyCodedLength=" + _dummycodedLength); - } - } - else { - nwords[ncolID-1] = words[colID-1]; - ncolID++; - } - } - - return nwords; - } - @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java index 55a0bde..ae9b809 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java @@ -336,36 +336,6 @@ public class EncoderMVImpute extends Encoder throw new RuntimeException(ex); } } - - @Override - public String[] apply(String[] words) - { - if( isApplicable() ) - for(int i=0; i < _colList.length; i++) { - int colID = _colList[i]; - String w = UtilFunctions.unquote(words[colID-1]); - if(TfUtils.isNA(_NAstrings, w)) - w = words[colID-1] = _replacementList[i]; - - if ( _isMVScaled.get(i) ) - if ( _mvscMethodList[i] == 
MVMethod.GLOBAL_MEAN ) - words[colID-1] = Double.toString( UtilFunctions.parseToDouble(w) - _meanList[i]._sum ); - else - words[colID-1] = Double.toString( (UtilFunctions.parseToDouble(w) - _meanList[i]._sum) / _varList[i].mean._sum ); - } - - if(_scnomvList != null) - for(int i=0; i < _scnomvList.length; i++) - { - int colID = _scnomvList[i]; - if ( _scnomvMethodList[i] == MVMethod.GLOBAL_MEAN ) - words[colID-1] = Double.toString( UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum ); - else - words[colID-1] = Double.toString( (UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum) / _scnomvVarList[i].mean._sum ); - } - - return words; - } @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java index af09cee..0f74590 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java @@ -71,11 +71,6 @@ public class EncoderOmit extends Encoder } @Override - public String[] apply(String[] words) { - return null; - } - - @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { //determine output size http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java index d84ea0d..ee22ac1 100644 --- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java @@ -48,11 +48,6 @@ public class EncoderPassThrough extends Encoder public void build(FrameBlock in) { //do nothing } - - @Override - public String[] apply(String[] in) { - return in; - } @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java index 855d565..526d31e 100644 --- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java +++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java @@ -29,7 +29,6 @@ import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.transform.TfUtils; import org.apache.sysml.runtime.transform.meta.TfMetaUtils; -import org.apache.sysml.runtime.util.UtilFunctions; import org.apache.wink.json4j.JSONException; import org.apache.wink.json4j.JSONObject; @@ -39,7 +38,6 @@ public class EncoderRecode extends Encoder //recode maps and custom map for partial recode maps private HashMap<Integer, HashMap<String, Long>> _rcdMaps = new HashMap<Integer, HashMap<String, Long>>(); - private HashMap<Integer, HashMap<String,String>> _finalMaps = null; private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null; public EncoderRecode(JSONObject parsedSpec, String[] colnames, int clen) @@ -60,17 +58,9 @@ public class EncoderRecode extends Encoder return _rcdMapsPart; } - public HashMap<Integer, HashMap<String,String>> getRecodeMaps() { - return _finalMaps; - } - - private String 
lookupRCDMap(int colID, String key) { - if( _finalMaps!=null ) - return _finalMaps.get(colID).get(key); - else { //used for cp - Long tmp = _rcdMaps.get(colID).get(key); - return (tmp!=null) ? Long.toString(tmp) : null; - } + private long lookupRCDMap(int colID, String key) { + Long tmp = _rcdMaps.get(colID).get(key); + return (tmp!=null) ? tmp : -1; } @Override @@ -132,28 +122,6 @@ public class EncoderRecode extends Encoder } } - /** - * Method to apply transformations. - */ - @Override - public String[] apply(String[] words) - { - if( !isApplicable() ) - return words; - - //apply recode maps on relevant columns of given row - for(int i=0; i < _colList.length; i++) { - //prepare input and get code - int colID = _colList[i]; - String key = UtilFunctions.unquote(words[colID-1].trim()); - String val = lookupRCDMap(colID, key); - // replace unseen keys with NaN - words[colID-1] = (val!=null) ? val : "NaN"; - } - - return words; - } - @Override public MatrixBlock apply(FrameBlock in, MatrixBlock out) { //apply recode maps column wise @@ -162,9 +130,9 @@ public class EncoderRecode extends Encoder for( int i=0; i<in.getNumRows(); i++ ) { Object okey = in.get(i, colID-1); String key = (okey!=null) ? okey.toString() : null; - String val = lookupRCDMap(colID, key); - out.quickSetValue(i, colID-1, (val!=null) ? - Double.parseDouble(val) : Double.NaN); + long code = lookupRCDMap(colID, key); + out.quickSetValue(i, colID-1, + (code >= 0) ? code : Double.NaN); } } @@ -228,4 +196,3 @@ public class EncoderRecode extends Encoder return token + Lop.DATATYPE_PREFIX + code.toString(); } } - \ No newline at end of file
