Repository: incubator-systemml
Updated Branches:
  refs/heads/master e16fe1df5 -> 67395b832
[SYSTEMML-583] Extended jmlc read util transform meta data (resources) This patch extends the JMLC API by (1) additional read utilities to read transform meta data from resource streams (e.g., for scoring scenarios where the transform meta data is bundled into jars), and (2) appropriate API documentation of all meta data read utilities. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/67395b83 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/67395b83 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/67395b83 Branch: refs/heads/master Commit: 67395b832b2b7d78cb1cf1b058b19500bc4915c7 Parents: e16fe1d Author: Matthias Boehm <[email protected]> Authored: Tue Apr 5 23:08:35 2016 -0700 Committer: Matthias Boehm <[email protected]> Committed: Wed Apr 6 10:19:13 2016 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/api/jmlc/Connection.java | 98 +++++++++++++++++--- .../sysml/runtime/io/IOUtilFunctions.java | 33 ++++++- .../functions/jmlc/FrameReadMetaTest.java | 2 +- 3 files changed, 116 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/main/java/org/apache/sysml/api/jmlc/Connection.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java b/src/main/java/org/apache/sysml/api/jmlc/Connection.java index 00477cf..e2d76da 100644 --- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java +++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java @@ -349,7 +349,7 @@ public class Connection public double[][] convertToDoubleMatrix(String input, int rows, int cols) throws IOException { - InputStream is = new ByteArrayInputStream(input.getBytes("UTF-8")); + InputStream is = 
IOUtilFunctions.toInputStream(input); return convertToDoubleMatrix(is, rows, cols); } @@ -386,25 +386,30 @@ public class Connection } /** + * Reads transform meta data from an HDFS file path and converts it into an in-memory + * FrameBlock object. The column names in the meta data file 'column.names' is processed + * with default separator ','. * - * @param spec - * @param metapath + * @param spec transform specification as json string + * @param metapath hdfs file path to meta data directory * @return * @throws IOException */ - public FrameBlock readTransformMetaData(String spec, String metapath) throws IOException { - return readTransformMetaData(spec, metapath, TfUtils.TXMTD_SEP); + public FrameBlock readTransformMetaDataFromFile(String spec, String metapath) throws IOException { + return readTransformMetaDataFromFile(spec, metapath, TfUtils.TXMTD_SEP); } /** - * NOTE: This is a temporary api - use readTransformMetaData(String spec, String metapath) if possible. + * Reads transform meta data from an HDFS file path and converts it into an in-memory + * FrameBlock object. * - * @param spec - * @param metapath + * @param spec transform specification as json string + * @param metapath hdfs file path to meta data directory + * @param colDelim separator for processing column names in the meta data file 'column.names' * @return - * @throws IOException + * @throws IOException */ - public FrameBlock readTransformMetaData(String spec, String metapath, String colDelim) + public FrameBlock readTransformMetaDataFromFile(String spec, String metapath, String colDelim) throws IOException { //read column types (for sanity check column names) @@ -436,17 +441,80 @@ public class Connection } //create frame block from in-memory strings - return readTransformMetaData(spec, rows, colnames, meta); + return convertToTransformMetaDataFrame(spec, rows, colnames, meta); } /** + * Reads transform meta data from the class path and converts it into an in-memory + * FrameBlock object. 
The column names in the meta data file 'column.names' is processed + * with default separator ','. * - * @param spec - * @param meta + * @param spec transform specification as json string + * @param metapath resource path to meta data directory * @return - * @throws IOException + * @throws IOException + */ + public FrameBlock readTransformMetaDataFromPath(String spec, String metapath) throws IOException { + return readTransformMetaDataFromPath(spec, metapath, TfUtils.TXMTD_SEP); + } + + /** + * Reads transform meta data from the class path and converts it into an in-memory + * FrameBlock object. + * + * @param spec transform specification as json string + * @param metapath resource path to meta data directory + * @param colDelim separator for processing column names in the meta data file 'column.names' + * @return + * @throws IOException + */ + public FrameBlock readTransformMetaDataFromPath(String spec, String metapath, String colDelim) + throws IOException + { + //read column types (for sanity check column names) + String coltypesStr = IOUtilFunctions.toString(Connection.class.getResourceAsStream(metapath+"/"+TfUtils.TXMTD_COLTYPES)); + List<String> coltypes = Arrays.asList(IOUtilFunctions.split(coltypesStr.trim(), TfUtils.TXMTD_SEP)); + + //read column names + String colnamesStr = IOUtilFunctions.toString(Connection.class.getResourceAsStream(metapath+"/"+TfUtils.TXMTD_COLNAMES)); + List<String> colnames = Arrays.asList(IOUtilFunctions.split(colnamesStr.trim(), colDelim)); + if( coltypes.size() != colnames.size() ) { + LOG.warn("Number of columns names: "+colnames.size()+" (expected: "+coltypes.size()+")."); + LOG.warn("--Sample column names: "+(!colnames.isEmpty()?colnames.get(0):"null")); + } + + //read meta data (currently only recode supported, without parsing spec) + HashMap<String,String> meta = new HashMap<String,String>(); + int rows = 0; + for( int j=0; j<colnames.size(); j++ ) { + String colName = colnames.get(j); + String name = 
metapath+"/"+"Recode"+"/"+colName; + String map = IOUtilFunctions.toString(Connection.class.getResourceAsStream(name+TfUtils.TXMTD_RCD_MAP_SUFFIX)); + if( map != null ) { + meta.put(colName, map); + String ndistinct = IOUtilFunctions.toString(Connection.class.getResourceAsStream(name+TfUtils.TXMTD_RCD_DISTINCT_SUFFIX)); + rows = Math.max(rows, Integer.parseInt(ndistinct)); + } + else if( coltypes.get(j).equals("2") ) { + LOG.warn("Recode map for column '"+colName+"' does not exist."); + } + } + + //create frame block from in-memory strings + return convertToTransformMetaDataFrame(spec, rows, colnames, meta); + } + + /** + * Converts transform meta data into an in-memory FrameBlock object. + * + * @param spec transform specification as json string + * @param rows maximum number of distinct items (number of rows in frame block) + * @param colnames column names, ordered by position + * @param meta map of (column name, recode map)-pairs, with recode maps in their original csv representation + * @return + * @throws IOException */ - public FrameBlock readTransformMetaData(String spec, int rows, List<String> colnames, HashMap<String,String> meta) + public FrameBlock convertToTransformMetaDataFrame(String spec, int rows, List<String> colnames, HashMap<String,String> meta) throws IOException { //create frame block w/ pure string schema http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java index 2daa257..3b618d0 100644 --- a/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java +++ b/src/main/java/org/apache/sysml/runtime/io/IOUtilFunctions.java @@ -21,6 +21,8 @@ package org.apache.sysml.runtime.io; import java.io.BufferedReader; import java.io.BufferedWriter; +import 
java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -30,7 +32,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.mapred.RecordReader; - +import org.apache.sysml.runtime.util.LocalFileUtils; import org.apache.sysml.runtime.util.UtilFunctions; public class IOUtilFunctions @@ -202,4 +204,33 @@ public class IOUtilFunctions //all tokens required for empty cells and in order to keep cell alignment return StringUtils.splitByWholeSeparatorPreserveAllTokens(str, delim); } + + /** + * + * @param input + * @return + * @throws IOException + */ + public static InputStream toInputStream(String input) throws IOException { + if( input == null ) + return null; + return new ByteArrayInputStream(input.getBytes("UTF-8")); + } + + /** + * + * @param input + * @return + * @throws IOException + */ + public static String toString(InputStream input) throws IOException { + if( input == null ) + return null; + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + byte[] buff = new byte[LocalFileUtils.BUFFER_SIZE]; + for( int len=0; (len=input.read(buff))!=-1; ) + bos.write(buff, 0, len); + input.close(); + return bos.toString("UTF-8"); + } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/67395b83/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java index 35a2e6e..bb5cbd9 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java @@ -82,7 +82,7 @@ 
public class FrameReadMetaTest extends AutomatedTestBase //read meta data frame String spec = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR+"tfmtd_example/spec.json"); - FrameBlock M = conn.readTransformMetaData(spec, SCRIPT_DIR + TEST_DIR+"tfmtd_example/"); + FrameBlock M = conn.readTransformMetaDataFromFile(spec, SCRIPT_DIR + TEST_DIR+"tfmtd_example/"); //generate data based on recode maps HashMap<String,Long>[] RC = getRecodeMaps(M);
