Repository: incubator-systemml
Updated Branches:
  refs/heads/master e16fe1df5 -> 67395b832


[SYSTEMML-583] Extended jmlc read util transform meta data (resources)

This patch extends the JMLC API by (1) additional read utilities to read
transform meta data from resource streams (e.g., for scoring scenarios
where the transform meta data is bundled into jars), and (2) appropriate
API documentation of all meta data read utilities.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/67395b83
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/67395b83
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/67395b83

Branch: refs/heads/master
Commit: 67395b832b2b7d78cb1cf1b058b19500bc4915c7
Parents: e16fe1d
Author: Matthias Boehm <[email protected]>
Authored: Tue Apr 5 23:08:35 2016 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Wed Apr 6 10:19:13 2016 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/api/jmlc/Connection.java   | 98 +++++++++++++++++---
 .../sysml/runtime/io/IOUtilFunctions.java       | 33 ++++++-
 .../functions/jmlc/FrameReadMetaTest.java       |  2 +-
 3 files changed, 116 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/main/java/org/apache/sysml/api/jmlc/Connection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java 
b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
index 00477cf..e2d76da 100644
--- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java
+++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
@@ -349,7 +349,7 @@ public class Connection
        public double[][] convertToDoubleMatrix(String input, int rows, int 
cols) 
                throws IOException
        {
-               InputStream is = new 
ByteArrayInputStream(input.getBytes("UTF-8"));
+               InputStream is = IOUtilFunctions.toInputStream(input);
                return convertToDoubleMatrix(is, rows, cols);
        }
        
@@ -386,25 +386,30 @@ public class Connection
        }
        
        /**
+        * Reads transform meta data from an HDFS file path and converts it 
into an in-memory
+        * FrameBlock object. The column names in the meta data file 
'column.names' are processed
+        * with default separator ','. 
         * 
-        * @param spec
-        * @param metapath
+        * @param spec      transform specification as json string
+        * @param metapath  hdfs file path to meta data directory
         * @return
         * @throws IOException
         */
-       public FrameBlock readTransformMetaData(String spec, String metapath) 
throws IOException {
-               return readTransformMetaData(spec, metapath, TfUtils.TXMTD_SEP);
+       public FrameBlock readTransformMetaDataFromFile(String spec, String 
metapath) throws IOException {
+               return readTransformMetaDataFromFile(spec, metapath, 
TfUtils.TXMTD_SEP);
        }
        
        /**
-        * NOTE: This is a temporary api - use readTransformMetaData(String 
spec, String metapath) if possible. 
+        * Reads transform meta data from an HDFS file path and converts it 
into an in-memory
+        * FrameBlock object.  
         * 
-        * @param spec
-        * @param metapath
+        * @param spec      transform specification as json string
+        * @param metapath  hdfs file path to meta data directory
+        * @param colDelim  separator for processing column names in the meta 
data file 'column.names'
         * @return
-        * @throws IOException 
+        * @throws IOException
         */
-       public FrameBlock readTransformMetaData(String spec, String metapath, 
String colDelim) 
+       public FrameBlock readTransformMetaDataFromFile(String spec, String 
metapath, String colDelim) 
                throws IOException 
        {
                //read column types (for sanity check column names)
@@ -436,17 +441,80 @@ public class Connection
                }
                
                //create frame block from in-memory strings
-               return readTransformMetaData(spec, rows, colnames, meta);
+               return convertToTransformMetaDataFrame(spec, rows, colnames, 
meta);
        }
        
        /**
+        * Reads transform meta data from the class path and converts it into 
an in-memory
+        * FrameBlock object. The column names in the meta data file 
'column.names' are processed
+        * with default separator ','. 
         * 
-        * @param spec
-        * @param meta
+        * @param spec      transform specification as json string
+        * @param metapath  resource path to meta data directory
         * @return
-        * @throws IOException 
+        * @throws IOException
+        */
+       public FrameBlock readTransformMetaDataFromPath(String spec, String 
metapath) throws IOException {
+               return readTransformMetaDataFromPath(spec, metapath, 
TfUtils.TXMTD_SEP);
+       }
+       
+       /**
+        * Reads transform meta data from the class path and converts it into 
an in-memory
+        * FrameBlock object.  
+        * 
+        * @param spec      transform specification as json string
+        * @param metapath  resource path to meta data directory
+        * @param colDelim  separator for processing column names in the meta 
data file 'column.names'
+        * @return
+        * @throws IOException
+        */
+       public FrameBlock readTransformMetaDataFromPath(String spec, String 
metapath, String colDelim) 
+               throws IOException 
+       {
+               //read column types (for sanity check column names)
+               String coltypesStr = 
IOUtilFunctions.toString(Connection.class.getResourceAsStream(metapath+"/"+TfUtils.TXMTD_COLTYPES));
+               List<String> coltypes = 
Arrays.asList(IOUtilFunctions.split(coltypesStr.trim(), TfUtils.TXMTD_SEP));
+               
+               //read column names
+               String colnamesStr = 
IOUtilFunctions.toString(Connection.class.getResourceAsStream(metapath+"/"+TfUtils.TXMTD_COLNAMES));
+               List<String> colnames = 
Arrays.asList(IOUtilFunctions.split(colnamesStr.trim(), colDelim));
+               if( coltypes.size() != colnames.size() ) {
+                       LOG.warn("Number of columns names: "+colnames.size()+" 
(expected: "+coltypes.size()+").");
+                       LOG.warn("--Sample column names: 
"+(!colnames.isEmpty()?colnames.get(0):"null"));
+               }
+               
+               //read meta data (currently only recode supported, without 
parsing spec)
+               HashMap<String,String> meta = new HashMap<String,String>();
+               int rows = 0;
+               for( int j=0; j<colnames.size(); j++ ) {
+                       String colName = colnames.get(j);
+                       String name = metapath+"/"+"Recode"+"/"+colName;
+                       String map = 
IOUtilFunctions.toString(Connection.class.getResourceAsStream(name+TfUtils.TXMTD_RCD_MAP_SUFFIX));
+                       if( map != null ) {
+                               meta.put(colName, map);
+                               String ndistinct = 
IOUtilFunctions.toString(Connection.class.getResourceAsStream(name+TfUtils.TXMTD_RCD_DISTINCT_SUFFIX));
+                               rows = Math.max(rows, 
Integer.parseInt(ndistinct));
+                       }
+                       else if( coltypes.get(j).equals("2") ) {
+                               LOG.warn("Recode map for column '"+colName+"' 
does not exist.");
+                       }
+               }
+               
+               //create frame block from in-memory strings
+               return convertToTransformMetaDataFrame(spec, rows, colnames, 
meta);
+       }
+       
+       /**
+        * Converts transform meta data into an in-memory FrameBlock object.
+        * 
+        * @param spec      transform specification as json string
+        * @param rows      maximum number of distinct items (number of rows in 
frame block)
+        * @param colnames  column names, ordered by position
+        * @param meta      map of (column name, recode map)-pairs, with recode 
maps in their original csv representation
+        * @return
+        * @throws IOException
         */
-       public FrameBlock readTransformMetaData(String spec, int rows, 
List<String> colnames, HashMap<String,String> meta) 
+       public FrameBlock convertToTransformMetaDataFrame(String spec, int 
rows, List<String> colnames, HashMap<String,String> meta) 
                throws IOException 
        {
                //create frame block w/ pure string schema

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java 
b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java
index 2daa257..3b618d0 100644
--- a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java
@@ -21,6 +21,8 @@ package org.apache.sysml.runtime.io;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -30,7 +32,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.mapred.RecordReader;
-
+import org.apache.sysml.runtime.util.LocalFileUtils;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class IOUtilFunctions 
@@ -202,4 +204,33 @@ public class IOUtilFunctions
                //all tokens required for empty cells and in order to keep cell 
alignment
                return StringUtils.splitByWholeSeparatorPreserveAllTokens(str, 
delim);
        }
+       
+       /**
+        * 
+        * @param input
+        * @return
+        * @throws IOException
+        */
+       public static InputStream toInputStream(String input) throws 
IOException {
+               if( input == null ) 
+                       return null;
+               return new ByteArrayInputStream(input.getBytes("UTF-8"));
+       }
+       
+       /**
+        * 
+        * @param input
+        * @return
+        * @throws IOException
+        */
+       public static String toString(InputStream input) throws IOException {
+               if( input == null )
+                       return null;
+               ByteArrayOutputStream bos = new ByteArrayOutputStream();
+               byte[] buff = new byte[LocalFileUtils.BUFFER_SIZE];
+               for( int len=0; (len=input.read(buff))!=-1; )
+                       bos.write(buff, 0, len);
+               input.close();          
+               return bos.toString("UTF-8");
+       }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java
index 35a2e6e..bb5cbd9 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java
@@ -82,7 +82,7 @@ public class FrameReadMetaTest extends AutomatedTestBase
                
                //read meta data frame
                String spec = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + 
TEST_DIR+"tfmtd_example/spec.json");
-               FrameBlock M = conn.readTransformMetaData(spec, SCRIPT_DIR + 
TEST_DIR+"tfmtd_example/");
+               FrameBlock M = conn.readTransformMetaDataFromFile(spec, 
SCRIPT_DIR + TEST_DIR+"tfmtd_example/");
                
                //generate data based on recode maps
                HashMap<String,Long>[] RC = getRecodeMaps(M);

Reply via email to