[SYSTEMML-556] Extended jmlc api for reading frames (file/path), tests Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/14e898ac Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/14e898ac Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/14e898ac
Branch: refs/heads/master Commit: 14e898acc4ac1dde839cc19342f0f3bd4fcab904 Parents: 2c19736 Author: Matthias Boehm <[email protected]> Authored: Fri Apr 22 18:05:31 2016 -0700 Committer: Matthias Boehm <[email protected]> Committed: Fri Apr 22 21:51:09 2016 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/api/jmlc/Connection.java | 150 +++++++++++++++++++ .../sysml/runtime/io/FrameReaderTextCell.java | 16 +- .../functions/jmlc/FrameReadMetaTest.java | 25 +++- .../jmlc/tfmtd_frame_example/tfmtd_frame | 26 ++++ .../jmlc/tfmtd_frame_example/tfmtd_frame.mtd | 10 ++ 5 files changed, 220 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/14e898ac/src/main/java/org/apache/sysml/api/jmlc/Connection.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java b/src/main/java/org/apache/sysml/api/jmlc/Connection.java index b384ca5..6a8dc45 100644 --- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java +++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java @@ -55,6 +55,9 @@ import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.Program; import org.apache.sysml.runtime.controlprogram.caching.CacheableData; +import org.apache.sysml.runtime.io.FrameReader; +import org.apache.sysml.runtime.io.FrameReaderFactory; +import org.apache.sysml.runtime.io.FrameReaderTextCell; import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.io.MatrixReader; import org.apache.sysml.runtime.io.MatrixReaderFactory; @@ -259,6 +262,10 @@ public class Connection return sb.toString(); } + //////////////////////////////////////////// + // Read matrices + //////////////////////////////////////////// + /** * Reads an input matrix in arbitrary format from HDFS into a dense double array. * NOTE: this call currently only supports default configurations for CSV. @@ -407,6 +414,149 @@ public class Connection return ret; } + //////////////////////////////////////////// + // Read frames + //////////////////////////////////////////// + + /** + * Reads an input frame in arbitrary format from HDFS into a dense string array. + * NOTE: this call currently only supports default configurations for CSV. + * + * @param fname the filename of the input frame + * @return frame as a two-dimensional string array + * @throws IOException + */ + public String[][] readStringFrame(String fname) + throws IOException + { + try { + //read json meta data + String fnamemtd = DataExpression.getMTDFileName(fname); + JSONObject jmtd = new DataExpression().readMetadataFile(fnamemtd, false); + + //parse json meta data + long rows = jmtd.getLong(DataExpression.READROWPARAM); + long cols = jmtd.getLong(DataExpression.READCOLPARAM); + String format = jmtd.getString(DataExpression.FORMAT_TYPE); + InputInfo iinfo = InputInfo.stringExternalToInputInfo(format); + + //read frame file + return readStringFrame(fname, iinfo, rows, cols); + } + catch(Exception ex) { + throw new IOException(ex); + } + } + + /** + * Reads an input frame in arbitrary format from HDFS into a dense string array. + * NOTE: this call currently only supports default configurations for CSV. + * + * @param fname the filename of the input frame + * @param iinfo InputInfo object + * @param rows number of rows in the frame + * @param cols number of columns in the frame + * @return frame as a two-dimensional string array + * @throws IOException + */ + public String[][] readStringFrame(String fname, InputInfo iinfo, long rows, long cols) + throws IOException + { + try { + FrameReader reader = FrameReaderFactory.createFrameReader(iinfo); + FrameBlock mb = reader.readFrameFromHDFS(fname, rows, cols); + return DataConverter.convertToStringFrame(mb); + } + catch(Exception ex) { + throw new IOException(ex); + } + } + + /** + * Converts an input string representation of a frame in textcell format + * into a dense string array. The meta data string is the SystemML generated + * .mtd file including the number of rows and columns. + * + * @param input string frame in textcell format + * @param meta string representing SystemML frame metadata in JSON format + * @return frame as a two-dimensional string array + * @throws IOException + */ + public String[][] convertToStringFrame(String input, String meta) + throws IOException + { + try { + //parse json meta data + JSONObject jmtd = new JSONObject(meta); + int rows = jmtd.getInt(DataExpression.READROWPARAM); + int cols = jmtd.getInt(DataExpression.READCOLPARAM); + String format = jmtd.getString(DataExpression.FORMAT_TYPE); + + //sanity check input format + if(!(DataExpression.FORMAT_TYPE_VALUE_TEXT.equals(format) + ||DataExpression.FORMAT_TYPE_VALUE_MATRIXMARKET.equals(format))) { + throw new IOException("Invalid input format (expected: text or mm): "+format); + } + + //parse the input frame + return convertToStringFrame(input, rows, cols); + } + catch(Exception ex) { + throw new IOException(ex); + } + } + + /** + * Converts an input string representation of a frame in textcell format + * into a dense string array. The number of rows and columns need to be + * specified because textcell only represents non-zero values and hence + * does not define the dimensions in the general case. + * + * @param input string frame in textcell format + * @param rows number of rows in the frame + * @param cols number of columns in the frame + * @return frame as a two-dimensional string array + * @throws IOException + */ + public String[][] convertToStringFrame(String input, int rows, int cols) + throws IOException + { + InputStream is = IOUtilFunctions.toInputStream(input); + return convertToStringFrame(is, rows, cols); + } + + /** + * Converts an input stream of a string frame in textcell format + * into a dense string array. The number of rows and columns need to be + * specified because textcell only represents non-zero values and hence + * does not define the dimensions in the general case. + * + * @param input InputStream to a string frame in textcell format + * @param rows number of rows in the frame + * @param cols number of columns in the frame + * @return frame as a two-dimensional string array + * @throws IOException + */ + public String[][] convertToStringFrame(InputStream input, int rows, int cols) + throws IOException + { + String[][] ret = null; + + try { + //read input matrix + FrameReaderTextCell reader = (FrameReaderTextCell)FrameReaderFactory.createFrameReader(InputInfo.TextCellInputInfo); + FrameBlock mb = reader.readFrameFromInputStream(input, rows, cols); + + //convert to double array + ret = DataConverter.convertToStringFrame( mb ); + } + catch(DMLRuntimeException rex) { + throw new IOException( rex ); + } + + return ret; + } + //////////////////////////////////////////// // Read transform meta data http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/14e898ac/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java b/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java index 91b7892..2f42a64 100644 --- a/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java +++ b/src/main/java/org/apache/sysml/runtime/io/FrameReaderTextCell.java @@ -84,6 +84,20 @@ public class FrameReaderTextCell extends FrameReader /** * * @param is + * @param rlen + * @param clen + * @return + * @throws IOException + * @throws DMLRuntimeException + */ + public FrameBlock readFrameFromInputStream(InputStream is, long rlen, long clen) + throws IOException, DMLRuntimeException { + return readFrameFromInputStream(is, getDefSchema(clen), getDefColNames(clen), rlen, clen); + } + + /** + * + * @param is * @param schema * @param names * @param rlen @@ -93,7 +107,7 @@ public class FrameReaderTextCell extends FrameReader * @throws IOException */ public FrameBlock readFrameFromInputStream(InputStream is, List<ValueType> schema, List<String> names, long rlen, long clen) - throws IOException, DMLRuntimeException + throws IOException, DMLRuntimeException { //allocate output frame block FrameBlock ret = createOutputFrameBlock(schema, names, rlen); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/14e898ac/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java index 05beeed..1abe68a 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/jmlc/FrameReadMetaTest.java @@ -30,6 +30,7 @@ import org.apache.sysml.api.jmlc.PreparedScript; import org.apache.sysml.api.jmlc.ResultVariables; import org.apache.sysml.lops.Lop; import org.apache.sysml.runtime.matrix.data.FrameBlock; +import org.apache.sysml.runtime.util.DataConverter; import org.apache.sysml.runtime.util.MapReduceTool; import org.apache.sysml.test.integration.AutomatedTestBase; import org.apache.sysml.test.integration.TestConfiguration; @@ -54,22 +55,32 @@ public class FrameReadMetaTest extends AutomatedTestBase @Test public void testJMLCTransformDenseSpec() throws IOException { - runJMLCReadMetaTest(TEST_NAME1, false, true); + runJMLCReadMetaTest(TEST_NAME1, false, false, true); } @Test public void testJMLCTransformDenseReuseSpec() throws IOException { - runJMLCReadMetaTest(TEST_NAME1, true, true); + runJMLCReadMetaTest(TEST_NAME1, true, false, true); } @Test public void testJMLCTransformDense() throws IOException { - runJMLCReadMetaTest(TEST_NAME1, false, false); + runJMLCReadMetaTest(TEST_NAME1, false, false, false); } @Test public void testJMLCTransformDenseReuse() throws IOException { - runJMLCReadMetaTest(TEST_NAME1, true, false); + runJMLCReadMetaTest(TEST_NAME1, true, false, false); + } + + @Test + public void testJMLCTransformDenseReadFrame() throws IOException { + runJMLCReadMetaTest(TEST_NAME1, false, true, false); + } + + @Test + public void testJMLCTransformDenseReuseReadFrame() throws IOException { + runJMLCReadMetaTest(TEST_NAME1, true, true, false); } /** @@ -79,7 +90,7 @@ public class FrameReadMetaTest extends AutomatedTestBase * @param instType * @throws IOException */ - private void runJMLCReadMetaTest( String testname, boolean modelReuse, boolean useSpec ) + private void runJMLCReadMetaTest( String testname, boolean modelReuse, boolean readFrame, boolean useSpec ) throws IOException { String TEST_NAME = testname; @@ -92,7 +103,9 @@ public class FrameReadMetaTest extends AutomatedTestBase //read meta data frame String spec = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR+"tfmtd_example/spec.json"); - FrameBlock M = conn.readTransformMetaDataFromFile(useSpec ? spec : null, SCRIPT_DIR + TEST_DIR+"tfmtd_example/"); + FrameBlock M = readFrame ? + DataConverter.convertToFrameBlock(conn.readStringFrame(SCRIPT_DIR + TEST_DIR+"tfmtd_frame_example/tfmtd_frame")) : + conn.readTransformMetaDataFromFile(useSpec ? spec : null, SCRIPT_DIR + TEST_DIR+"tfmtd_example/"); //generate data based on recode maps HashMap<String,Long>[] RC = getRecodeMaps(M); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/14e898ac/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame b/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame new file mode 100644 index 0000000..60a3c18 --- /dev/null +++ b/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame @@ -0,0 +1,26 @@ +1 1 91312·1 +1 2 east·1 +1 4 1·1 +1 5 1·1 +1 6 1·1 +1 7 FALSE·1 +2 1 94555·2 +2 2 north·2 +2 4 2·2 +2 5 1.5·2 +2 6 2·2 +2 7 TRUE·2 +3 1 95141·3 +3 2 south·3 +3 4 3·3 +3 5 2·3 +3 6 3·3 +4 1 96334·4 +4 2 west·4 +4 4 4·4 +4 5 2.5·4 +5 1 98755·5 +5 4 5·5 +5 5 3·5 +6 4 6·6 +7 4 7·7 http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/14e898ac/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame.mtd ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame.mtd b/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame.mtd new file mode 100644 index 0000000..bc8b601 --- /dev/null +++ b/src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame.mtd @@ -0,0 +1,10 @@ +{ + "data_type": "frame", + "value_type": "string", + "rows": 7, + "cols": 9, + "format": "text", + "description": { + "author": "SystemML" + } +} \ No newline at end of file
