Repository: incubator-systemml Updated Branches: refs/heads/master 2f9769e20 -> d5eea2e85
[SYSTEMML-451] Updated the doc string for defmatrix class Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d5eea2e8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d5eea2e8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d5eea2e8 Branch: refs/heads/master Commit: d5eea2e85ade5178e832b8b5a7f27e7b1fb25a67 Parents: 2f9769e Author: Niketan Pansare <npan...@us.ibm.com> Authored: Wed Aug 31 16:33:46 2016 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Wed Aug 31 16:34:33 2016 -0700 ---------------------------------------------------------------------- src/main/python/SystemML/defmatrix.py | 121 ++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d5eea2e8/src/main/python/SystemML/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/defmatrix.py b/src/main/python/SystemML/defmatrix.py index 7e2c453..37e813c 100644 --- a/src/main/python/SystemML/defmatrix.py +++ b/src/main/python/SystemML/defmatrix.py @@ -44,6 +44,9 @@ def checkIfMLContextIsSet(): raise Exception('Expected setSparkContext(sc) to be called.') class DMLOp(object): + """ + Represents an intermediate node of Abstract syntax tree created to generate the PyDML script + """ def __init__(self, inputs, dml=None): self.inputs = inputs self.dml = dml @@ -53,11 +56,17 @@ class DMLOp(object): def reset(): + """ + Resets the visited status of matrix and the operators in the generated AST. 
+ """ for m in matrix.visited: m.visited = False matrix.visited = [] def binaryOp(lhs, rhs, opStr): + """ + Common function called by all the binary operators in matrix class + """ inputs = [] if isinstance(lhs, matrix): lhsStr = lhs.ID @@ -79,6 +88,10 @@ def binaryOp(lhs, rhs, opStr): return out def binaryMatrixFunction(X, Y, fnName): + """ + Common function called by supported PyDML built-in function that has two arguments both of which are matrices. + TODO: This needs to be generalized to support arbitrary arguments of different types. + """ if not isinstance(X, matrix) or not isinstance(Y, matrix): raise TypeError('Incorrect input type. Expected matrix type') inputs = [X, Y] @@ -88,9 +101,33 @@ def binaryMatrixFunction(X, Y, fnName): return out def solve(A, b): + """ + Computes the least squares solution for system of linear equations A %*% x = b + + Examples + -------- + >>> import numpy as np + >>> from sklearn import datasets + >>> import SystemML as sml + >>> from pyspark.sql import SQLContext + >>> diabetes = datasets.load_diabetes() + >>> diabetes_X = diabetes.data[:, np.newaxis, 2] + >>> X_train = diabetes_X[:-20] + >>> X_test = diabetes_X[-20:] + >>> y_train = diabetes.target[:-20] + >>> y_test = diabetes.target[-20:] + >>> sml.setSparkContext(sc) + >>> X = sml.matrix(X_train) + >>> y = sml.matrix(y_train) + >>> A = X.transpose().dot(X) + >>> b = X.transpose().dot(y) + >>> beta = sml.solve(A, b).toNumPyArray() + >>> y_predicted = X_test.dot(beta) + >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) + Residual sum of squares: 25282.12 + """ return binaryMatrixFunction(A, b, 'solve') - def eval(outputs, outputDF=False, execute=True): """ Executes the unevaluated DML script and computes the matrices specified by outputs. 
@@ -115,19 +152,73 @@ def eval(outputs, outputDF=False, execute=True): return ''.join(matrix.dml) matrix.script.scriptString = ''.join(matrix.dml) results = matrix.ml.execute(matrix.script) + # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. for m in outputs: if outputDF: m.data = results.getDataFrame(m.ID) else: m.data = results.getNumPyArray(m.ID) -# Instead of inheriting from np.matrix class matrix(object): + """ + matrix class is a python wrapper that implements basic matrix operator. + Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. + + Examples + -------- + >>> import SystemML as sml + >>> import numpy as np + >>> sml.setSparkContext(sc) + + Welcome to Apache SystemML! + + >>> m1 = sml.matrix(np.ones((3,3)) + 2) + >>> m2 = sml.matrix(np.ones((3,3)) + 3) + >>> m2 = m1 * (m2 + m1) + >>> m4 = 1.0 - m2 + >>> m4 + # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods. + mVar1 = load(" ", format="csv") + mVar2 = load(" ", format="csv") + mVar3 = mVar2 + mVar1 + mVar4 = mVar1 * mVar3 + mVar5 = 1.0 - mVar4 + save(mVar5, " ") + + <SystemML.defmatrix.matrix object> + >>> m2.eval() + >>> m2 + # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method. + <SystemML.defmatrix.matrix object> + >>> m4 + # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods. 
+ mVar4 = load(" ", format="csv") + mVar5 = 1.0 - mVar4 + save(mVar5, " ") + + <SystemML.defmatrix.matrix object> + >>> m4.sum(axis=1).toNumPyArray() + array([[-60.], + [-60.], + [-60.]]) + """ + # Global variable that is used to keep track of intermediate matrix variables in the DML script systemmlVarID = 0 + + # Since joining of string is expensive operation, we collect the set of strings into list and then join + # them before execution: See matrix.script.scriptString = ''.join(matrix.dml) in eval() method dml = [] + + # Represents MLContext's script object script = None + + # Represents MLContext object ml = None + + # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects + # that have been previously evaluated. visited = [] + def __init__(self, data, op=None): """ Constructs a lazy matrix @@ -153,14 +244,23 @@ class matrix(object): raise TypeError('Unsupported input type') def eval(self, outputDF=False): + """ + This is a convenience function that calls the global eval method + """ eval([self], outputDF=False) - + def toPandas(self): + """ + This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame. + """ if self.data is None: self.eval() return convertToPandasDF(self.data) def toNumPyArray(self): + """ + This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array. + """ if self.data is None: self.eval() if isinstance(self.data, DataFrame): @@ -169,6 +269,9 @@ class matrix(object): return self.data def toDataFrame(self): + """ + This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame. + """ if self.data is None: self.eval(outputDF=True) if not isinstance(self.data, DataFrame): @@ -178,6 +281,12 @@ class matrix(object): return self.data def _visit(self, execute=True): + """ + This function is called for two scenarios: + 1. 
For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method. + 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself + and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext. + """ if self.visited: return self self.visited = True @@ -203,6 +312,9 @@ class matrix(object): return self def __repr__(self): + """ + This function helps to debug matrix class and also examine the generated PyDML script + """ if self.data is None: print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False)) elif isinstance(self.data, DataFrame): @@ -281,6 +393,9 @@ class matrix(object): return self._aggFn('trace', axis) def _aggFn(self, fnName, axis): + """ + Common function that is called for functions that have axis as parameter. + """ dmlOp = DMLOp([self]) out = matrix(None, op=dmlOp) if axis is None: