[SYSTEMML-1791] Performance of frame block indexing and transformapply

This patch makes the following performance improvements to various frame
operations in order to remove unnecessary overheads:

(1) Shallow column copy on full column indexing.

(2) Bidirectional reuse of recode maps across original meta data frame
blocks and shallow column copies (e.g., after column indexing).

(3) Avoid unnecessary long-to-string-to-double conversions on
transformapply (the recently removed file-based transform required
string lookups; we now avoid this long-to-string conversion, which is
unnecessary for the related frame operations).

Furthermore, this patch also cleans up a couple of methods which
became obsolete after the removal of the old file-based transform.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4a24b9a7
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4a24b9a7
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4a24b9a7

Branch: refs/heads/master
Commit: 4a24b9a78424dc85fe774e6d2dd5689fea9cd5b1
Parents: 7cd978d
Author: Matthias Boehm <[email protected]>
Authored: Wed Jul 19 23:16:44 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Thu Jul 20 01:24:24 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/FrameBlock.java   | 45 ++++++++++----------
 .../sysml/runtime/transform/encode/Encoder.java | 10 -----
 .../runtime/transform/encode/EncoderBin.java    | 31 --------------
 .../transform/encode/EncoderComposite.java      |  7 ---
 .../transform/encode/EncoderDummycode.java      | 38 -----------------
 .../transform/encode/EncoderMVImpute.java       | 30 -------------
 .../runtime/transform/encode/EncoderOmit.java   |  5 ---
 .../transform/encode/EncoderPassThrough.java    |  5 ---
 .../runtime/transform/encode/EncoderRecode.java | 45 +++-----------------
 9 files changed, 29 insertions(+), 187 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
index bfe236e..5e6404b 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/FrameBlock.java
@@ -67,13 +67,8 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
        /** The data frame data as an ordered list of columns */
        private Array[] _coldata = null;
        
-       /** Cache for recode maps from frame meta data, indexed by column 
0-based */
-       private Map<Integer, SoftReference<HashMap<String,Long>>> _rcdMapCache 
= null;
-       
        public FrameBlock() {
                _numRows = 0;
-               if( REUSE_RECODE_MAPS )
-                       _rcdMapCache = new HashMap<Integer, 
SoftReference<HashMap<String,Long>>>();
        }
        
        /**
@@ -120,8 +115,6 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                        _colmeta[j] = new ColumnMetadata(0);
                for( int i=0; i<data.length; i++ )
                        appendRow(data[i]);
-               if( REUSE_RECODE_MAPS )
-                       _rcdMapCache = new HashMap<Integer, 
SoftReference<HashMap<String,Long>>>();
        }
        
        /**
@@ -872,16 +865,25 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                                ret._colnames[j-cl] = getColumnName(j);
                }       
                ret._numRows = ru-rl+1;
-
-               //copy output data
-               if(ret._coldata == null ) { 
+               if(ret._coldata == null )
                        ret._coldata = new Array[numCols];
+               
+               //fast-path: shallow copy column indexing 
+               if( ret._numRows == _numRows ) {
+                       //this shallow copy does not only avoid an array copy, 
but
+                       //also allows for bi-directional reuses of recodemaps 
                        for( int j=cl; j<=cu; j++ )
-                               ret._coldata[j-cl] = _coldata[j].slice(rl,ru);
+                               ret._coldata[j-cl] = _coldata[j];
+               }
+               //copy output data
+               else {
+                       for( int j=cl; j<=cu; j++ ) {
+                               if( ret._coldata[j-cl] == null )
+                                       ret._coldata[j-cl] = 
_coldata[j].slice(rl,ru);
+                               else
+                                       ret._coldata[j-cl].set(0, ru-rl, 
_coldata[j], rl);
+                       }
                }
-               else
-                       for( int j=cl; j<=cu; j++ )
-                               ret._coldata[j-cl].set(0, ru-rl, _coldata[j], 
rl);      
                
                return ret;
        }
@@ -1023,7 +1025,7 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
        public HashMap<String,Long> getRecodeMap(int col) {
                //probe cache for existing map
                if( REUSE_RECODE_MAPS ) {
-                       SoftReference<HashMap<String,Long>> tmp = 
_rcdMapCache.get(col);
+                       SoftReference<HashMap<String,Long>> tmp = 
_coldata[col]._rcdMapCache;
                        HashMap<String,Long> map = (tmp!=null) ? tmp.get() : 
null;
                        if( map != null ) return map;
                }
@@ -1034,10 +1036,8 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                for( int i=0; i<getNumRows(); i++ ) {
                        Object val = ldata.get(i);
                        if( val != null ) {
-//                             String[] tmp = IOUtilFunctions.splitCSV(
-//                                             val.toString(), 
Lop.DATATYPE_PREFIX);
-
-                               // Instead of using splitCSV which is forcing 
string with RFC-4180 format, using Lop.DATATYPE_PREFIX separator to split token 
and code 
+                               // Instead of using splitCSV which is forcing 
string with RFC-4180 format, 
+                               // using Lop.DATATYPE_PREFIX separator to split 
token and code 
                                String[] tmp =  new String[2];
                                int pos = 
val.toString().lastIndexOf(Lop.DATATYPE_PREFIX);
                                tmp[0] = val.toString().substring(0, pos);
@@ -1047,9 +1047,8 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
                }
                
                //put created map into cache
-               if( REUSE_RECODE_MAPS ) {
-                       _rcdMapCache.put(col, new 
SoftReference<HashMap<String,Long>>(map));
-               }
+               if( REUSE_RECODE_MAPS )
+                       _coldata[col]._rcdMapCache = new SoftReference<>(map);
                
                return map;
        }
@@ -1245,6 +1244,8 @@ public class FrameBlock implements Writable, CacheBlock, 
Externalizable
         * in order to avoid unnecessary dependencies.
         */
        private abstract static class Array<T> implements Writable {
+               protected SoftReference<HashMap<String,Long>> _rcdMapCache = 
null;
+               
                protected int _size = 0;
                protected int newSize() {
                        return (int) Math.max(_size*2, 4); 

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
index 304dcdb..e4af8a6 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/Encoder.java
@@ -117,16 +117,6 @@ public abstract class Encoder implements Serializable
         * @return output matrix block
         */
        public abstract MatrixBlock apply(FrameBlock in, MatrixBlock out);
-       
-       /**
-        * Encode input data according to existing transform meta
-        * data (transform apply).
-        * TODO remove once file-based transform removed
-        * 
-        * @param in input data as string array
-        * @return encoded data as string array
-        */
-       public abstract String[] apply(String[] in);
 
        /**
         * Construct a frame block out of the transform meta data.

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
index fbe6994..e70a392 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderBin.java
@@ -44,7 +44,6 @@ public class EncoderBin extends Encoder
 
        private int[] _numBins = null;
        private double[] _min=null, _max=null;  // min and max among 
non-missing values
-       private double[] _binWidths = null;             // width of a bin for 
each attribute
        
        //frame transform-apply attributes
        private double[][] _binMins = null;
@@ -83,8 +82,6 @@ public class EncoderBin extends Encoder
                        Arrays.fill(_min, Double.MAX_VALUE);
                        _max = new double[_colList.length];
                        Arrays.fill(_max, -Double.MAX_VALUE);
-                       
-                       _binWidths = new double[_colList.length];
                }
        }
 
@@ -121,34 +118,6 @@ public class EncoderBin extends Encoder
                // nothing to do
        }
        
-       /**
-        * Method to apply transformations.
-        */
-       @Override
-       public String[] apply(String[] words) {
-               if( !isApplicable() )
-                       return words;
-       
-               for(int i=0; i < _colList.length; i++) {
-                       int colID = _colList[i];
-                       try {
-                               double val = 
UtilFunctions.parseToDouble(words[colID-1]);
-                               int binid = 1;
-                               double tmp = _min[i] + _binWidths[i];
-                               while(val > tmp && binid < _numBins[i]) {
-                                       tmp += _binWidths[i];
-                                       binid++;
-                               }
-                               words[colID-1] = Integer.toString(binid);
-                       } 
-                       catch(NumberFormatException e) {
-                               throw new RuntimeException("Encountered \"" + 
words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric 
value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an 
appropriate imputation method.");
-                       }
-               }
-               
-               return words;
-       }
-
        @Override
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
                for(int j=0; j<_colList.length; j++) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
index deff887..ffff1df 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderComposite.java
@@ -80,13 +80,6 @@ public class EncoderComposite extends Encoder
                for( Encoder encoder : _encoders )
                        encoder.build(in);
        }
-
-       @Override
-       public String[] apply(String[] in) {
-               for( Encoder encoder : _encoders )
-                       encoder.apply(in);
-               return in;
-       }
        
        @Override 
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
index 743381a..9a2f059 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderDummycode.java
@@ -58,44 +58,6 @@ public class EncoderDummycode extends Encoder
                //do nothing
        }
        
-       /**
-        * Method to apply transformations.
-        * 
-        * @param words array of strings
-        * @return array of transformed strings
-        */
-       @Override
-       public String[] apply(String[] words) 
-       {
-               if( !isApplicable() )
-                       return words;
-               
-               String[] nwords = new String[(int)_dummycodedLength];
-               int rcdVal = 0;
-               
-               for(int colID=1, idx=0, ncolID=1; colID <= words.length; 
colID++) {
-                       if(idx < _colList.length && colID==_colList[idx]) {
-                               // dummycoded columns
-                               try {
-                                       rcdVal = 
UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID-1]));
-                                       nwords[ ncolID-1+rcdVal-1 ] = "1";
-                                       ncolID += _domainSizes[idx];
-                                       idx++;
-                               } 
-                               catch (Exception e) {
-                                       throw new RuntimeException("Error in 
dummycoding: colID="+colID + ", rcdVal=" + rcdVal+", word="+words[colID-1] 
-                                                       + ", domainSize=" + 
_domainSizes[idx] + ", dummyCodedLength=" + _dummycodedLength);
-                               }
-                       }
-                       else {
-                               nwords[ncolID-1] = words[colID-1];
-                               ncolID++;
-                       }
-               }
-               
-               return nwords;
-       }
-       
        @Override
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) 
        {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
index 55a0bde..ae9b809 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderMVImpute.java
@@ -336,36 +336,6 @@ public class EncoderMVImpute extends Encoder
                        throw new RuntimeException(ex);
                }
        }
-
-       @Override
-       public String[] apply(String[] words) 
-       {       
-               if( isApplicable() )
-                       for(int i=0; i < _colList.length; i++) {
-                               int colID = _colList[i];
-                               String w = 
UtilFunctions.unquote(words[colID-1]);
-                               if(TfUtils.isNA(_NAstrings, w))
-                                       w = words[colID-1] = 
_replacementList[i];
-                               
-                               if ( _isMVScaled.get(i) )
-                                       if ( _mvscMethodList[i] == 
MVMethod.GLOBAL_MEAN )
-                                               words[colID-1] = 
Double.toString( UtilFunctions.parseToDouble(w) - _meanList[i]._sum );
-                                       else
-                                               words[colID-1] = 
Double.toString( (UtilFunctions.parseToDouble(w) - _meanList[i]._sum) / 
_varList[i].mean._sum );
-                       }
-               
-               if(_scnomvList != null)
-               for(int i=0; i < _scnomvList.length; i++)
-               {
-                       int colID = _scnomvList[i];
-                       if ( _scnomvMethodList[i] == MVMethod.GLOBAL_MEAN )
-                               words[colID-1] = Double.toString( 
UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum );
-                       else
-                               words[colID-1] = Double.toString( 
(UtilFunctions.parseToDouble(words[colID-1]) - _scnomvMeanList[i]._sum) / 
_scnomvVarList[i].mean._sum );
-               }
-                       
-               return words;
-       }
        
        @Override
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
index af09cee..0f74590 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderOmit.java
@@ -71,11 +71,6 @@ public class EncoderOmit extends Encoder
        }
        
        @Override
-       public String[] apply(String[] words) {
-               return null;
-       }
-       
-       @Override
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) 
        {
                //determine output size

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
index d84ea0d..ee22ac1 100644
--- 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
+++ 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderPassThrough.java
@@ -48,11 +48,6 @@ public class EncoderPassThrough extends Encoder
        public void build(FrameBlock in) {
                //do nothing
        }
-
-       @Override
-       public String[] apply(String[] in) {
-               return in;
-       }
        
        @Override 
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/4a24b9a7/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java 
b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
index 855d565..526d31e 100644
--- a/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
+++ b/src/main/java/org/apache/sysml/runtime/transform/encode/EncoderRecode.java
@@ -29,7 +29,6 @@ import org.apache.sysml.runtime.matrix.data.FrameBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.transform.TfUtils;
 import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
-import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.wink.json4j.JSONException;
 import org.apache.wink.json4j.JSONObject;
 
@@ -39,7 +38,6 @@ public class EncoderRecode extends Encoder
 
        //recode maps and custom map for partial recode maps 
        private HashMap<Integer, HashMap<String, Long>> _rcdMaps  = new 
HashMap<Integer, HashMap<String, Long>>();
-       private HashMap<Integer, HashMap<String,String>> _finalMaps = null;
        private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null;
        
        public EncoderRecode(JSONObject parsedSpec, String[] colnames, int clen)
@@ -60,17 +58,9 @@ public class EncoderRecode extends Encoder
                return _rcdMapsPart; 
        }
        
-       public HashMap<Integer, HashMap<String,String>> getRecodeMaps() {
-               return _finalMaps;
-       }
-       
-       private String lookupRCDMap(int colID, String key) {
-               if( _finalMaps!=null )
-                       return _finalMaps.get(colID).get(key);
-               else { //used for cp
-                       Long tmp = _rcdMaps.get(colID).get(key);
-                       return (tmp!=null) ? Long.toString(tmp) : null;
-               }
+       private long lookupRCDMap(int colID, String key) {
+               Long tmp = _rcdMaps.get(colID).get(key);
+               return (tmp!=null) ? tmp : -1;
        }
        
        @Override
@@ -132,28 +122,6 @@ public class EncoderRecode extends Encoder
                }
        }
        
-       /**
-        * Method to apply transformations.
-        */
-       @Override
-       public String[] apply(String[] words) 
-       {
-               if( !isApplicable() )
-                       return words;
-               
-               //apply recode maps on relevant columns of given row
-               for(int i=0; i < _colList.length; i++) {
-                       //prepare input and get code
-                       int colID = _colList[i];
-                       String key = 
UtilFunctions.unquote(words[colID-1].trim());
-                       String val = lookupRCDMap(colID, key);                  
-                       // replace unseen keys with NaN 
-                       words[colID-1] = (val!=null) ? val : "NaN";
-               }
-                       
-               return words;
-       }
-       
        @Override
        public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
                //apply recode maps column wise
@@ -162,9 +130,9 @@ public class EncoderRecode extends Encoder
                        for( int i=0; i<in.getNumRows(); i++ ) {
                                Object okey = in.get(i, colID-1);
                                String key = (okey!=null) ? okey.toString() : 
null;
-                               String val = lookupRCDMap(colID, key);          
        
-                               out.quickSetValue(i, colID-1, (val!=null) ? 
-                                               Double.parseDouble(val) : 
Double.NaN);
+                               long code = lookupRCDMap(colID, key);           
        
+                               out.quickSetValue(i, colID-1,
+                                       (code >= 0) ? code : Double.NaN);
                        }
                }
                
@@ -228,4 +196,3 @@ public class EncoderRecode extends Encoder
                return token + Lop.DATATYPE_PREFIX + code.toString();
        }
 }
- 
\ No newline at end of file

Reply via email to