Repository: incubator-systemml Updated Branches: refs/heads/master 50541d32e -> e6db69b87
[SYSTEMML-474] MLContext matrix from URL Initial support reading IJV and CSV matrices into MLContext via URLs. Closes #210. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/e6db69b8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/e6db69b8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/e6db69b8 Branch: refs/heads/master Commit: e6db69b873a3ac8be022f5ba7273d37124e34497 Parents: 50541d3 Author: Deron Eriksson <[email protected]> Authored: Tue Aug 16 21:38:27 2016 -0700 Committer: Deron Eriksson <[email protected]> Committed: Tue Aug 16 21:38:27 2016 -0700 ---------------------------------------------------------------------- docs/_config.yml | 2 +- docs/spark-mlcontext-programming-guide.md | 101 +++++++++++++++++++ pom.xml | 1 + .../api/mlcontext/MLContextConversionUtil.java | 79 +++++++++++++-- .../sysml/api/mlcontext/MLContextUtil.java | 35 ++----- .../integration/mlcontext/MLContextTest.java | 52 +++++++++- .../org/apache/sysml/api/mlcontext/1234.ijv | 4 + .../org/apache/sysml/api/mlcontext/1234.ijv.mtd | 11 ++ 8 files changed, 248 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/docs/_config.yml ---------------------------------------------------------------------- diff --git a/docs/_config.yml b/docs/_config.yml index 1a09658..2ef66a0 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -11,7 +11,7 @@ include: - _modules # These allow the documentation to be updated with newer releases -SYSTEMML_VERSION: 0.11.0 +SYSTEMML_VERSION: 0.10.x # if 'analytics_on' is true, analytics section will be rendered on the HTML pages analytics_on: true http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/docs/spark-mlcontext-programming-guide.md ---------------------------------------------------------------------- diff --git a/docs/spark-mlcontext-programming-guide.md b/docs/spark-mlcontext-programming-guide.md index 2f77347..71db1a4 100644 --- a/docs/spark-mlcontext-programming-guide.md +++ b/docs/spark-mlcontext-programming-guide.md @@ -662,6 +662,107 @@ None </div> +Alternatively, we could supply a `java.net.URL` to the Script `in` method. Note that if the URL matrix data is in IJV +format, metadata needs to be supplied for the matrix. + +<div class="codetabs"> + +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +val habermanUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data" +val typesRDD = sc.parallelize(Array("1.0,1.0,1.0,2.0")) +val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml" +val uni = dmlFromUrl(scriptUrl).in("A", new java.net.URL(habermanUrl)).in("K", typesRDD).in("$CONSOLE_OUTPUT", true) +ml.execute(uni) +{% endhighlight %} +</div> + +<div data-lang="Spark Shell" markdown="1"> +{% highlight scala %} +scala> val habermanUrl = "http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data" +habermanUrl: String = http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data + +scala> val typesRDD = sc.parallelize(Array("1.0,1.0,1.0,2.0")) +typesRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[50] at parallelize at <console>:33 + +scala> val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml" +scriptUrl: String = https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml + +scala> val uni = dmlFromUrl(scriptUrl).in("A", new java.net.URL(habermanUrl)).in("K", typesRDD).in("$CONSOLE_OUTPUT", true) +uni: org.apache.sysml.api.mlcontext.Script = +Inputs: + [1] (URL) A: http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data + [2] (RDD) K: ParallelCollectionRDD[50] at parallelize at <console>:33 + [3] (Boolean) $CONSOLE_OUTPUT: true + +Outputs: +None + + +scala> ml.execute(uni) +... +------------------------------------------------- + (01) Minimum | 30.0 + (02) Maximum | 83.0 + (03) Range | 53.0 + (04) Mean | 52.45751633986928 + (05) Variance | 116.71458266366658 + (06) Std deviation | 10.803452349303281 + (07) Std err of mean | 0.6175922641866753 + (08) Coeff of variation | 0.20594669940735139 + (09) Skewness | 0.1450718616532357 + (10) Kurtosis | -0.6150152487211726 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 52.0 + (14) Interquartile mean | 52.16013071895425 +Feature [1]: Scale +------------------------------------------------- + (01) Minimum | 58.0 + (02) Maximum | 69.0 + (03) Range | 11.0 + (04) Mean | 62.85294117647059 + (05) Variance | 10.558630665380907 + (06) Std deviation | 3.2494046632238507 + (07) Std err of mean | 0.18575610076612029 + (08) Coeff of variation | 0.051698529971741194 + (09) Skewness | 0.07798443581479181 + (10) Kurtosis | -1.1324380182967442 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 63.0 + (14) Interquartile mean | 62.80392156862745 +Feature [2]: Scale +------------------------------------------------- + (01) Minimum | 0.0 + (02) Maximum | 52.0 + (03) Range | 52.0 + (04) Mean | 4.026143790849673 + (05) Variance | 51.691117539912135 + (06) Std deviation | 7.189653506248555 + (07) Std err of mean | 0.41100513466216837 + (08) Coeff of variation | 1.7857418611299172 + (09) Skewness | 2.954633471088322 + (10) Kurtosis | 11.425776549251449 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 1.0 + (14) Interquartile mean | 1.2483660130718954 +Feature [3]: Scale +------------------------------------------------- +Feature [4]: Categorical (Nominal) + (15) Num of categories | 2 + (16) Mode | 1 + (17) Num of modes | 1 +res5: org.apache.sysml.api.mlcontext.MLResults = +None + +{% endhighlight %} +</div> + +</div> + + ### Input Variables vs Input Parameters If we examine the http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index b271cf7..a4c66a1 100644 --- a/pom.xml +++ b/pom.xml @@ -577,6 +577,7 @@ <exclude>docs</exclude> <exclude>**/docs/**</exclude> <exclude>**/*.csv</exclude> + <exclude>**/*.ijv</exclude> <exclude>**/*.json</exclude> <exclude>**/*.mtx</exclude> <exclude>**/*.mtd</exclude> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java index 161ad17..33a5a3c 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java @@ -19,10 +19,13 @@ package org.apache.sysml.api.mlcontext; +import java.io.InputStream; +import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.spark.Accumulator; @@ -118,6 +121,71 @@ public class MLContextConversionUtil { } /** + * Convert a matrix at a URL to a {@code MatrixObject}. + * + * @param variableName + * name of the variable associated with the matrix + * @param url + * the URL to a matrix (in CSV or IJV format) + * @param matrixMetadata + * the matrix metadata + * @return the matrix at a URL converted to a {@code MatrixObject} + */ + public static MatrixObject urlToMatrixObject(String variableName, URL url, MatrixMetadata matrixMetadata) { + try { + InputStream is = url.openStream(); + List<String> lines = IOUtils.readLines(is); + MLContext activeMLContext = (MLContext) MLContextProxy.getActiveMLContext(); + SparkContext sparkContext = activeMLContext.getSparkContext(); + @SuppressWarnings("resource") + JavaSparkContext javaSparkContext = new JavaSparkContext(sparkContext); + JavaRDD<String> javaRDD = javaSparkContext.parallelize(lines); + if ((matrixMetadata == null) || (matrixMetadata.getMatrixFormat() == MatrixFormat.CSV)) { + MatrixObject matrixObject = javaRDDStringCSVToMatrixObject(variableName, javaRDD, matrixMetadata); + return matrixObject; + } else if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) { + MatrixObject matrixObject = javaRDDStringIJVToMatrixObject(variableName, javaRDD, matrixMetadata); + return matrixObject; + } + return null; + } catch (Exception e) { + throw new MLContextException("Exception converting URL to MatrixObject", e); + } + } + + /** + * Convert a {@code MatrixBlock} to a {@code MatrixObject}. + * + * @param variableName + * name of the variable associated with the matrix + * @param matrixBlock + * matrix as a MatrixBlock + * @param matrixMetadata + * the matrix metadata + * @return the {@code MatrixBlock} converted to a {@code MatrixObject} + */ + public static MatrixObject matrixBlockToMatrixObject(String variableName, MatrixBlock matrixBlock, + MatrixMetadata matrixMetadata) { + try { + MatrixCharacteristics matrixCharacteristics; + if (matrixMetadata != null) { + matrixCharacteristics = matrixMetadata.asMatrixCharacteristics(); + } else { + matrixCharacteristics = new MatrixCharacteristics(); + } + MatrixFormatMetaData mtd = new MatrixFormatMetaData(matrixCharacteristics, + OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo); + MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, MLContextUtil.scratchSpace() + "/" + + variableName, mtd); + matrixObject.acquireModify(matrixBlock); + matrixObject.release(); + return matrixObject; + } catch (CacheException e) { + throw new MLContextException("Exception converting MatrixBlock to MatrixObject", e); + } + } + + /** * Convert a {@code JavaPairRDD<MatrixIndexes, MatrixBlock>} to a * {@code MatrixObject}. * @@ -687,16 +755,13 @@ public class MLContextConversionUtil { SparkContext sc = activeMLContext.getSparkContext(); SQLContext sqlContext = new SQLContext(sc); DataFrame df = null; - if(isVectorDF) { + if (isVectorDF) { df = RDDConverterUtilsExt.binaryBlockToVectorDataFrame(binaryBlockMatrix, matrixCharacteristics, sqlContext); + } else { + df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, sqlContext); } - else { - df = RDDConverterUtilsExt.binaryBlockToDataFrame(binaryBlockMatrix, matrixCharacteristics, - sqlContext); - } - - + return df; } catch (DMLRuntimeException e) { throw new MLContextException("DMLRuntimeException while converting matrix object to DataFrame", e); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java index fc942e9..ea7857e 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java @@ -20,6 +20,7 @@ package org.apache.sysml.api.mlcontext; import java.io.FileNotFoundException; +import java.net.URL; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Date; @@ -44,9 +45,7 @@ import org.apache.sysml.conf.CompilerConfig.ConfigType; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.parser.ParseException; -import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.LocalVariableMap; -import org.apache.sysml.runtime.controlprogram.caching.CacheException; import org.apache.sysml.runtime.controlprogram.caching.FrameObject; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.instructions.cp.BooleanObject; @@ -54,12 +53,8 @@ import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.cp.DoubleObject; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.StringObject; -import org.apache.sysml.runtime.matrix.MatrixCharacteristics; -import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; -import org.apache.sysml.runtime.matrix.data.InputInfo; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; -import org.apache.sysml.runtime.matrix.data.OutputInfo; /** * Utility class containing methods for working with the MLContext API. @@ -78,7 +73,7 @@ public final class MLContextUtil { */ @SuppressWarnings("rawtypes") public static final Class[] COMPLEX_DATA_TYPES = { JavaRDD.class, RDD.class, DataFrame.class, - BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class }; + BinaryBlockMatrix.class, Matrix.class, (new double[][] {}).getClass(), MatrixBlock.class, URL.class }; /** * All data types supported by the MLContext API @@ -343,7 +338,7 @@ public final class MLContextUtil { /** * Is the object one of the supported complex data types? (JavaRDD, RDD, - * DataFrame, BinaryBlockMatrix, Matrix, double[][]) + * DataFrame, BinaryBlockMatrix, Matrix, double[][], MatrixBlock, URL) * * @param object * the object type to be examined @@ -457,23 +452,11 @@ public final class MLContextUtil { return matrixObject; } else if (value instanceof MatrixBlock) { - MatrixCharacteristics matrixCharacteristics; - if (matrixMetadata != null) { - matrixCharacteristics = matrixMetadata.asMatrixCharacteristics(); - } else { - matrixCharacteristics = new MatrixCharacteristics(); - } - MatrixFormatMetaData mtd = new MatrixFormatMetaData(matrixCharacteristics, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo); - MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, MLContextUtil.scratchSpace() + "/" + name, mtd); - try { - matrixObject.acquireModify((MatrixBlock)value); - matrixObject.release(); - } catch (CacheException e) { - throw new MLContextException(e); - } + MatrixBlock matrixBlock = (MatrixBlock) value; + MatrixObject matrixObject = MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock, + matrixMetadata); return matrixObject; - } - else if (value instanceof DataFrame) { + } else if (value instanceof DataFrame) { DataFrame dataFrame = (DataFrame) value; MatrixObject matrixObject = MLContextConversionUtil .dataFrameToMatrixObject(name, dataFrame, matrixMetadata); @@ -496,6 +479,10 @@ public final class MLContextUtil { MatrixObject matrixObject = MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix, matrixMetadata); return matrixObject; + } else if (value instanceof URL) { + URL url = (URL) value; + MatrixObject matrixObject = MLContextConversionUtil.urlToMatrixObject(name, url, matrixMetadata); + return matrixObject; } else if (value instanceof Integer) { Integer i = (Integer) value; IntObject iObj = new IntObject(i); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java index 4afd275..e6e1046 100644 --- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java +++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java @@ -93,7 +93,7 @@ public class MLContextTest extends AutomatedTestBase { sc = new JavaSparkContext(conf); ml = new MLContext(sc); } - + @Override public void setUp() { addTestConfiguration(TEST_DIR, TEST_NAME); @@ -1641,6 +1641,48 @@ public class MLContextTest extends AutomatedTestBase { ml.execute(script); } + @Test + public void testCSVMatrixFromURLSumDML() throws MalformedURLException { + System.out.println("MLContextTest - CSV matrix from URL sum DML"); + String csv = "https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.csv"; + URL url = new URL(csv); + Script script = dml("print('sum: ' + sum(M));").in("M", url); + setExpectedStdOut("sum: 10.0"); + ml.execute(script); + } + + @Test + public void testCSVMatrixFromURLSumPYDML() throws MalformedURLException { + System.out.println("MLContextTest - CSV matrix from URL sum PYDML"); + String csv = "https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.csv"; + URL url = new URL(csv); + Script script = pydml("print('sum: ' + sum(M))").in("M", url); + setExpectedStdOut("sum: 10.0"); + ml.execute(script); + } + + @Test + public void testIJVMatrixFromURLSumDML() throws MalformedURLException { + System.out.println("MLContextTest - IJV matrix from URL sum DML"); + String ijv = "https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv"; + URL url = new URL(ijv); + MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, 2, 2); + Script script = dml("print('sum: ' + sum(M));").in("M", url, mm); + setExpectedStdOut("sum: 10.0"); + ml.execute(script); + } + + @Test + public void testIJVMatrixFromURLSumPYDML() throws MalformedURLException { + System.out.println("MLContextTest - IJV matrix from URL sum PYDML"); + String ijv = "https://raw.githubusercontent.com/apache/incubator-systemml/master/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv"; + URL url = new URL(ijv); + MatrixMetadata mm = new MatrixMetadata(MatrixFormat.IJV, 2, 2); + Script script = pydml("print('sum: ' + sum(M))").in("M", url, mm); + setExpectedStdOut("sum: 10.0"); + ml.execute(script); + } + // NOTE: Uncomment these tests once they work // @SuppressWarnings({ "rawtypes", "unchecked" }) @@ -1714,13 +1756,13 @@ public class MLContextTest extends AutomatedTestBase { @AfterClass public static void tearDownClass() { - //stop spark context to allow single jvm tests (otherwise the - //next test that tries to create a SparkContext would fail) + // stop spark context to allow single jvm tests (otherwise the + // next test that tries to create a SparkContext would fail) sc.stop(); sc = null; conf = null; - - //clear status mlcontext and spark exec context + + // clear status mlcontext and spark exec context ml.close(); ml = null; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv ---------------------------------------------------------------------- diff --git a/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv new file mode 100644 index 0000000..ddf4c44 --- /dev/null +++ b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv @@ -0,0 +1,4 @@ +1 1 1.0 +1 2 2.0 +2 1 3.0 +2 2 4.0 http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/e6db69b8/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd ---------------------------------------------------------------------- diff --git a/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd new file mode 100644 index 0000000..d459138 --- /dev/null +++ b/src/test/scripts/org/apache/sysml/api/mlcontext/1234.ijv.mtd @@ -0,0 +1,11 @@ +{ + "data_type": "matrix", + "value_type": "double", + "rows": 2, + "cols": 2, + "nnz": 4, + "format": "text", + "description": { + "author": "SystemML" + } +} \ No newline at end of file
