Repository: incubator-systemml Updated Branches: refs/heads/master d5eea2e85 -> 10e701de3
[SYSTEMML-889] Remove MLContext Java input output methods This removes the `Script.input` and `Script.output` methods on the JVM side, and hacks the Python side so that it is able to call the `Script.in` method on the JVM side. This means that a Scala/Java user would use the clean `in(...)` and `out(...)` syntax, while a Python user would use the `input(...)` and `output(...)` syntax. Closes #229. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/10e701de Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/10e701de Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/10e701de Branch: refs/heads/master Commit: 10e701de3fc145f83303afd3d609d7cb720a25a0 Parents: d5eea2e Author: Mike Dusenberry <[email protected]> Authored: Thu Sep 1 10:13:03 2016 -0700 Committer: Mike Dusenberry <[email protected]> Committed: Thu Sep 1 10:13:03 2016 -0700 ---------------------------------------------------------------------- docs/beginners-guide-python.md | 2 +- .../org/apache/sysml/api/mlcontext/Script.java | 112 ------------------- src/main/python/SystemML/defmatrix.py | 102 ++++++++--------- src/main/python/SystemML/mlcontext.py | 38 ++++--- src/main/python/tests/test_mlcontext.py | 18 +-- 5 files changed, 83 insertions(+), 189 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/10e701de/docs/beginners-guide-python.md ---------------------------------------------------------------------- diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md index b565656..3b4aeed 100644 --- a/docs/beginners-guide-python.md +++ b/docs/beginners-guide-python.md @@ -328,6 +328,6 @@ X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:.9 * n_samples])) y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:.9 * n_samples])) ml = sml.MLContext(sc) script = os.path.join(os.environ['SYSTEMML_HOME'], 'scripts', 'algorithms', 'MultiLogReg.dml') -script = sml.dml(script).input(X=X_df, Y_vec=y_df).input(**{"$X": ' ', "$Y": ' ', "$B": ' '}).out("B_out") +script = sml.dml(script).input(X=X_df, Y_vec=y_df).output("B_out") beta = ml.execute(script).getNumPyArray('B_out') ``` http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/10e701de/src/main/java/org/apache/sysml/api/mlcontext/Script.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/Script.java b/src/main/java/org/apache/sysml/api/mlcontext/Script.java index a3ce430..bfa947c 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/Script.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/Script.java @@ -247,17 +247,6 @@ public class Script { } /** - * Pass a map of inputs to the script. - * - * @param inputs - * map of inputs (parameters ($) and variables). - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(Map<String, Object> inputs) { - return in(inputs); - } - - /** * Pass a Scala Map of inputs to the script. * <p> * Note that the {@code Map} value type is not explicitly specified on this @@ -282,26 +271,6 @@ public class Script { } /** - * Pass a Scala Map of inputs to the script. - * <p> - * Note that the {@code Map} value type is not explicitly specified on this - * method because {@code [String, Any]} can't be recognized on the Java side - * since {@code Any} doesn't have an equivalent in the Java class hierarchy - * ({@code scala.Any} is a superclass of {@code scala.AnyRef}, which is - * equivalent to {@code java.lang.Object}). Therefore, specifying - * {@code scala.collection.Map<String, Object>} as an input parameter to - * this Java method is not encompassing enough and would require types such - * as a {@code scala.Double} to be cast using {@code asInstanceOf[AnyRef]}. - * - * @param inputs - * Scala Map of inputs (parameters ($) and variables). - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(scala.collection.Map<String, ?> inputs) { - return in(inputs); - } - - /** * Pass a Scala Seq of inputs to the script. The inputs are either two-value * or three-value tuples, where the first value is the variable name, the * second value is the variable value, and the third optional value is the @@ -330,20 +299,6 @@ public class Script { } /** - * Pass a Scala Seq of inputs to the script. The inputs are either two-value - * or three-value tuples, where the first value is the variable name, the - * second value is the variable value, and the third optional value is the - * metadata. - * - * @param inputs - * Scala Seq of inputs (parameters ($) and variables). - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(scala.collection.Seq<Object> inputs) { - return in(inputs); - } - - /** * Obtain an unmodifiable map of all input parameters ($). * * @return input parameters ($) @@ -366,19 +321,6 @@ public class Script { } /** - * Register an input (parameter ($) or variable). - * - * @param name - * name of the input - * @param value - * value of the input - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(String name, Object value) { - return in(name, value); - } - - /** * Register an input (parameter ($) or variable) with optional matrix * metadata. * @@ -403,22 +345,6 @@ public class Script { * name of the input * @param value * value of the input - * @param matrixFormat - * optional matrix format - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(String name, Object value, MatrixFormat matrixFormat) { - return in(name, value, matrixFormat); - } - - /** - * Register an input (parameter ($) or variable) with optional matrix - * metadata. - * - * @param name - * name of the input - * @param value - * value of the input * @param matrixMetadata * optional matrix metadata * @return {@code this} Script object to allow chaining of methods @@ -473,22 +399,6 @@ public class Script { } /** - * Register an input (parameter ($) or variable) with optional matrix - * metadata. - * - * @param name - * name of the input - * @param value - * value of the input - * @param matrixMetadata - * optional matrix metadata - * @return {@code this} Script object to allow chaining of methods - */ - public Script input(String name, Object value, MatrixMetadata matrixMetadata) { - return in(name, value, matrixMetadata); - } - - /** * Register an output variable. * * @param outputName @@ -501,17 +411,6 @@ public class Script { } /** - * Register an output variable. - * - * @param outputName - * name of the output variable - * @return {@code this} Script object to allow chaining of methods - */ - public Script output(String outputName) { - return out(outputName); - } - - /** * Register output variables. * * @param outputNames @@ -524,17 +423,6 @@ public class Script { } /** - * Register output variables. - * - * @param outputNames - * names of the output variables - * @return {@code this} Script object to allow chaining of methods - */ - public Script output(String... outputNames) { - return output(outputNames); - } - - /** * Clear the inputs, outputs, and symbol table. */ public void clearIOS() { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/10e701de/src/main/python/SystemML/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/defmatrix.py b/src/main/python/SystemML/defmatrix.py index 37e813c..18f6314 100644 --- a/src/main/python/SystemML/defmatrix.py +++ b/src/main/python/SystemML/defmatrix.py @@ -30,7 +30,7 @@ from pyspark.sql import DataFrame, SQLContext def setSparkContext(sc): """ Before using the matrix, the user needs to invoke this function. - + Parameters ---------- sc: SparkContext @@ -38,7 +38,7 @@ def setSparkContext(sc): """ matrix.ml = MLContext(sc) matrix.sc = sc - + def checkIfMLContextIsSet(): if matrix.ml is None: raise Exception('Expected setSparkContext(sc) to be called.') @@ -50,10 +50,10 @@ class DMLOp(object): def __init__(self, inputs, dml=None): self.inputs = inputs self.dml = dml - + def _visit(self, execute=True): matrix.dml = matrix.dml + self.dml - + def reset(): """ @@ -62,7 +62,7 @@ def reset(): for m in matrix.visited: m.visited = False matrix.visited = [] - + def binaryOp(lhs, rhs, opStr): """ Common function called by all the binary operators in matrix class @@ -103,7 +103,7 @@ def binaryMatrixFunction(X, Y, fnName): def solve(A, b): """ Computes the least squares solution for system of linear equations A %*% x = b - + Examples -------- >>> import numpy as np @@ -123,7 +123,7 @@ def solve(A, b): >>> b = X.transpose().dot(y) >>> beta = sml.solve(A, b).toNumPyArray() >>> y_predicted = X_test.dot(beta) - >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) + >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) Residual sum of squares: 25282.12 """ return binaryMatrixFunction(A, b, 'solve') @@ -158,20 +158,20 @@ def eval(outputs, outputDF=False, execute=True): m.data = results.getDataFrame(m.ID) else: m.data = results.getNumPyArray(m.ID) - + class matrix(object): """ matrix class is a python wrapper that implements basic matrix operator. Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. - + Examples -------- >>> import SystemML as sml >>> import numpy as np >>> sml.setSparkContext(sc) - + Welcome to Apache SystemML! - + >>> m1 = sml.matrix(np.ones((3,3)) + 2) >>> m2 = sml.matrix(np.ones((3,3)) + 3) >>> m2 = m1 * (m2 + m1) @@ -184,7 +184,7 @@ class matrix(object): mVar4 = mVar1 * mVar3 mVar5 = 1.0 - mVar4 save(mVar5, " ") - + <SystemML.defmatrix.matrix object> >>> m2.eval() >>> m2 @@ -195,7 +195,7 @@ class matrix(object): mVar4 = load(" ", format="csv") mVar5 = 1.0 - mVar4 save(mVar5, " ") - + <SystemML.defmatrix.matrix object> >>> m4.sum(axis=1).toNumPyArray() array([[-60.], @@ -204,31 +204,31 @@ class matrix(object): """ # Global variable that is used to keep track of intermediate matrix variables in the DML script systemmlVarID = 0 - + # Since joining of string is expensive operation, we collect the set of strings into list and then join # them before execution: See matrix.script.scriptString = ''.join(matrix.dml) in eval() method dml = [] - + # Represents MLContext's script object script = None - + # Represents MLContext object ml = None - + # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects # that have been previously evaluated. visited = [] - + def __init__(self, data, op=None): """ Constructs a lazy matrix - + Parameters ---------- data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation). """ checkIfMLContextIsSet() - self.visited = False + self.visited = False matrix.systemmlVarID += 1 self.output = False self.ID = 'mVar' + str(matrix.systemmlVarID) @@ -242,21 +242,21 @@ class matrix(object): self.op = op else: raise TypeError('Unsupported input type') - + def eval(self, outputDF=False): """ This is a convenience function that calls the global eval method """ eval([self], outputDF=False) - + def toPandas(self): """ This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame. """ if self.data is None: self.eval() - return convertToPandasDF(self.data) - + return convertToPandasDF(self.data) + def toNumPyArray(self): """ This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array. @@ -267,7 +267,7 @@ class matrix(object): self.data = self.data.toPandas().as_matrix() # Always keep default format as NumPy array if possible return self.data - + def toDataFrame(self): """ This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame. @@ -279,13 +279,13 @@ class matrix(object): MLResults.sqlContext = SQLContext(matrix.sc) self.data = sqlContext.createDataFrame(self.toPandas()) return self.data - + def _visit(self, execute=True): """ This function is called for two scenarios: 1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method. - 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself - and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext. + 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself + and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext. """ if self.visited: return self @@ -308,13 +308,13 @@ class matrix(object): if self.data is None and self.output: matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n'] if execute: - matrix.script.out(self.ID) + matrix.script.output(self.ID) return self - + def __repr__(self): """ This function helps to debug matrix class and also examine the generated PyDML script - """ + """ if self.data is None: print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False)) elif isinstance(self.data, DataFrame): @@ -322,49 +322,49 @@ class matrix(object): else: print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.') return '<SystemML.defmatrix.matrix object>' - + def __add__(self, other): return binaryOp(self, other, ' + ') - + def __sub__(self, other): return binaryOp(self, other, ' - ') - + def __mul__(self, other): return binaryOp(self, other, ' * ') - + def __floordiv__(self, other): return binaryOp(self, other, ' // ') - + def __div__(self, other): return binaryOp(self, other, ' / ') - + def __mod__(self, other): return binaryOp(self, other, ' % ') - + def __pow__(self, other): return binaryOp(self, other, ' ** ') def __radd__(self, other): return binaryOp(other, self, ' + ') - + def __rsub__(self, other): return binaryOp(other, self, ' - ') - + def __rmul__(self, other): return binaryOp(other, self, ' * ') - + def __rfloordiv__(self, other): return binaryOp(other, self, ' // ') - + def __rdiv__(self, other): return binaryOp(other, self, ' / ') - + def __rmod__(self, other): return binaryOp(other, self, ' % ') - + def __rpow__(self, other): return binaryOp(other, self, ' ** ') - + def sum(self, axis=None): return self._aggFn('sum', axis) @@ -382,7 +382,7 @@ class matrix(object): def argmax(self, axis=None): return self._aggFn('argmax', axis) - + def cumsum(self, axis=None): return self._aggFn('cumsum', axis) @@ -391,20 +391,20 @@ class matrix(object): def trace(self, axis=None): return self._aggFn('trace', axis) - + def _aggFn(self, fnName, axis): """ Common function that is called for functions that have axis as parameter. - """ + """ dmlOp = DMLOp([self]) out = matrix(None, op=dmlOp) if axis is None: dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n'] else: dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n'] - return out + return out def dot(self, other): return binaryMatrixFunction(self, other, 'dot') - -__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve'] \ No newline at end of file + +__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/10e701de/src/main/python/SystemML/mlcontext.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/mlcontext.py b/src/main/python/SystemML/mlcontext.py index 7ed277a..1b90e70 100644 --- a/src/main/python/SystemML/mlcontext.py +++ b/src/main/python/SystemML/mlcontext.py @@ -22,10 +22,11 @@ import os try: + import py4j.java_gateway from py4j.java_gateway import JavaObject except ImportError: raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark') - + from pyspark import SparkContext import pyspark.mllib.common from pyspark.sql import DataFrame, SQLContext @@ -34,12 +35,12 @@ from .converters import * def dml(scriptString): """ Create a dml script object based on a string. - + Parameters ---------- scriptString: string Can be a path to a dml script or a dml script itself. - + Returns ------- script: Script instance @@ -53,12 +54,12 @@ def dml(scriptString): def pydml(scriptString): """ Create a pydml script object based on a string. - + Parameters ---------- scriptString: string Can be a path to a pydml script or a pydml script itself. - + Returns ------- script: Script instance @@ -92,12 +93,12 @@ def _py2java(sc, obj): class Matrix(object): """ Wrapper around a Java Matrix object. - + Parameters ---------- javaMatrix: JavaObject A Java Matrix object as returned by calling `ml.execute().get()`. - + sc: SparkContext SparkContext """ @@ -111,7 +112,7 @@ class Matrix(object): def toDF(self): """ Convert the Matrix to a PySpark SQL DataFrame. - + Returns ------- df: PySpark SQL DataFrame @@ -128,12 +129,12 @@ class Matrix(object): class MLResults(object): """ Wrapper around a Java ML Results object. - + Parameters ---------- results: JavaObject A Java MLResults object as returned by calling `ml.execute()`. - + sc: SparkContext SparkContext """ @@ -160,7 +161,7 @@ class MLResults(object): if len(outs) == 1: return outs[0] return outs - + def getDataFrame(self, *outputs): """ Parameters @@ -172,7 +173,7 @@ class MLResults(object): if len(outs) == 1: return outs[0] return outs - + def get(self, *outputs): """ Parameters @@ -194,7 +195,7 @@ class Script(object): ---------- scriptString: string Can be either a file path to a DML script or a DML script itself. - + scriptType: string Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like). """ @@ -223,7 +224,7 @@ class Script(object): self._input[name] = value return self - def out(self, *names): + def output(self, *names): """ Parameters ---------- @@ -287,10 +288,15 @@ class MLContext(object): script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString) for key, val in script._input.items(): - script_java.input(key, _py2java(self._sc, val)) + # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not + # allowed. Therefore, we use the following code in which we retrieve a function + # representing `script_java.in`, and then call it with the arguments. This is in + # lieu of adding a new `input` method on the JVM side, as that would complicate use + # from Scala/Java. + py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val)) for val in script._output: script_java.out(val) return MLResults(self._ml.execute(script_java), self._sc) -__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml'] \ No newline at end of file +__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/10e701de/src/main/python/tests/test_mlcontext.py ---------------------------------------------------------------------- diff --git a/src/main/python/tests/test_mlcontext.py b/src/main/python/tests/test_mlcontext.py index ec5a196..182a4d8 100644 --- a/src/main/python/tests/test_mlcontext.py +++ b/src/main/python/tests/test_mlcontext.py @@ -31,7 +31,7 @@ ml = MLContext(sc) class TestAPI(unittest.TestCase): def test_output_string(self): - script = dml("x1 = 'Hello World'").out("x1") + script = dml("x1 = 'Hello World'").output("x1") self.assertEqual(ml.execute(script).get("x1"), "Hello World") def test_output_list(self): @@ -40,7 +40,7 @@ class TestAPI(unittest.TestCase): x2 = x1 + 1 x3 = x1 + 2 """ - script = dml(script).out("x1", "x2", "x3") + script = dml(script).output("x1", "x2", "x3") self.assertEqual(ml.execute(script).get("x1", "x2"), [0.2, 1.2]) self.assertEqual(ml.execute(script).get("x1", "x3"), [0.2, 2.2]) @@ -50,7 +50,7 @@ class TestAPI(unittest.TestCase): m2 = m1 * 2 """ rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"]) - script = dml(sums).input(m1=rdd1).out("s1", "m2") + script = dml(sums).input(m1=rdd1).output("s1", "m2") s1, m2 = ml.execute(script).get("s1", "m2") self.assertEqual((s1, repr(m2)), (10.0, "Matrix")) @@ -60,7 +60,7 @@ class TestAPI(unittest.TestCase): m2 = m1 * 2 """ rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"]) - script = dml(sums).input(m1=rdd1).out("m2") + script = dml(sums).input(m1=rdd1).output("m2") m2 = ml.execute(script).get("m2") self.assertEqual(repr(m2.toDF()), "DataFrame[ID: double, C1: double, C2: double]") @@ -69,14 +69,14 @@ class TestAPI(unittest.TestCase): x2 = x1 + 1 x3 = x1 + 2 """ - script = dml(script).input("x1", 5).out("x2", "x3") + script = dml(script).input("x1", 5).output("x2", "x3") self.assertEqual(ml.execute(script).get("x2", "x3"), [6, 7]) def test_input(self): script = """ x3 = x1 + x2 """ - script = dml(script).input(x1=5, x2=3).out("x3") + script = dml(script).input(x1=5, x2=3).output("x3") self.assertEqual(ml.execute(script).get("x3"), 8) def test_rdd(self): @@ -87,13 +87,13 @@ class TestAPI(unittest.TestCase): """ rdd1 = sc.parallelize(["1.0,2.0", "3.0,4.0"]) rdd2 = sc.parallelize(["5.0,6.0", "7.0,8.0"]) - script = dml(sums).input(m1=rdd1).input(m2=rdd2).out("s1", "s2", "s3") + script = dml(sums).input(m1=rdd1).input(m2=rdd2).output("s1", "s2", "s3") self.assertEqual( ml.execute(script).get("s1", "s2", "s3"), [10.0, 26.0, "whatever"]) def test_pydml(self): script = "A = full('1 2 3 4 5 6 7 8 9', rows=3, cols=3)\nx = toString(A)" - script = pydml(script).out("x") + script = pydml(script).output("x") self.assertEqual( ml.execute(script).get("x"), '1.000 2.000 3.000\n4.000 5.000 6.000\n7.000 8.000 9.000\n' @@ -101,4 +101,4 @@ class TestAPI(unittest.TestCase): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
