[SYSTEMML-878] Update the Python package from SystemML to systemml

- Updated Python package name from SystemML to systemml
- Moved uploadToPyPI.sh script to dev/release
- Updated the documentation

Closes #231. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/542de374 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/542de374 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/542de374 Branch: refs/heads/master Commit: 542de374e19e25e3581de45af06d37a2f4cdaad0 Parents: 7610a21 Author: Niketan Pansare <[email protected]> Authored: Thu Sep 1 16:55:57 2016 -0700 Committer: Niketan Pansare <[email protected]> Committed: Thu Sep 1 16:55:57 2016 -0700 ---------------------------------------------------------------------- dev/release/uploadToPyPI.sh | 34 ++ docs/algorithms-classification.md | 18 +- docs/algorithms-regression.md | 8 +- docs/beginners-guide-python.md | 49 ++- src/main/python/MANIFEST.in | 18 +- src/main/python/SystemML/__init__.py | 29 -- src/main/python/SystemML/converters.py | 100 ----- src/main/python/SystemML/defmatrix.py | 410 -------------------- src/main/python/SystemML/mlcontext.py | 302 -------------- src/main/python/SystemML/mllearn/__init__.py | 25 -- src/main/python/SystemML/mllearn/estimators.py | 302 -------------- src/main/python/setup.py | 4 +- src/main/python/systemml/__init__.py | 29 ++ src/main/python/systemml/converters.py | 100 +++++ src/main/python/systemml/defmatrix.py | 410 ++++++++++++++++++++ src/main/python/systemml/mlcontext.py | 302 ++++++++++++++ src/main/python/systemml/mllearn/__init__.py | 25 ++ src/main/python/systemml/mllearn/estimators.py | 302 ++++++++++++++ src/main/python/tests/test_mlcontext.py | 2 +- src/main/python/tests/test_mllearn.py | 2 +- src/main/python/uploadToPyPI.sh | 34 -- 21 files changed, 1265 insertions(+), 1240 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/dev/release/uploadToPyPI.sh ---------------------------------------------------------------------- diff 
--git a/dev/release/uploadToPyPI.sh b/dev/release/uploadToPyPI.sh new file mode 100644 index 0000000..44a6b4f --- /dev/null +++ b/dev/release/uploadToPyPI.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +cd ../.. +mvn clean package -P distribution +tar -xzf target/systemml-*-SNAPSHOT.tar.gz -C src/main/python/systemml + +cd src/main/python/systemml +mv systemml-*-incubating-SNAPSHOT systemml-java + +cd .. +echo "Preparing to upload to PyPI ...." +python setup.py register sdist upload -r https://pypi.python.org/pypi + +rm -r systemml/systemml-java \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-classification.md ---------------------------------------------------------------------- diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 340267c..8d19d04 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -129,7 +129,7 @@ Eqs. (1) and (2). 
<div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression # C = 1/reg logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix @@ -237,7 +237,7 @@ SystemML Language Reference for details. {% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) digits = datasets.load_digits() @@ -253,7 +253,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te # MLPipeline way from pyspark.ml import Pipeline -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) @@ -498,7 +498,7 @@ support vector machine (`y` with domain size `2`). <div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import SVM +from systemml.mllearn import SVM # C = 1/reg svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix @@ -766,7 +766,7 @@ class labels. <div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import SVM +from systemml.mllearn import SVM # C = 1/reg svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix @@ -916,7 +916,7 @@ SystemML Language Reference for details. 
{% highlight python %} # Scikit-learn way from sklearn import datasets, neighbors -from SystemML.mllearn import SVM +from systemml.mllearn import SVM from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) digits = datasets.load_digits() @@ -932,7 +932,7 @@ print('LogisticRegression score: %f' % svm.fit(X_train, y_train).score(X_test, y # MLPipeline way from pyspark.ml import Pipeline -from SystemML.mllearn import SVM +from systemml.mllearn import SVM from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) @@ -1122,7 +1122,7 @@ applicable when all features are counts of categorical values. <div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import NaiveBayes +from systemml.mllearn import NaiveBayes nb = NaiveBayes(sqlCtx, laplace=1.0) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = nb.fit(X_train, y_train) @@ -1257,7 +1257,7 @@ SystemML Language Reference for details. 
{% highlight python %} from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer -from SystemML.mllearn import NaiveBayes +from systemml.mllearn import NaiveBayes from sklearn import metrics from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/algorithms-regression.md ---------------------------------------------------------------------- diff --git a/docs/algorithms-regression.md b/docs/algorithms-regression.md index 6585b00..992862e 100644 --- a/docs/algorithms-regression.md +++ b/docs/algorithms-regression.md @@ -82,7 +82,7 @@ efficient when the number of features $m$ is relatively small <div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import LinearRegression +from systemml.mllearn import LinearRegression # C = 1/reg lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix @@ -124,7 +124,7 @@ y_test = lr.fit(df_train) <div class="codetabs"> <div data-lang="Python" markdown="1"> {% highlight python %} -from SystemML.mllearn import LinearRegression +from systemml.mllearn import LinearRegression # C = 1/reg lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg') # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrames or SciPy Sparse matrices @@ -222,7 +222,7 @@ SystemML Language Reference for details. 
{% highlight python %} import numpy as np from sklearn import datasets -from SystemML.mllearn import LinearRegression +from systemml.mllearn import LinearRegression from pyspark.sql import SQLContext # Load the diabetes dataset diabetes = datasets.load_diabetes() @@ -277,7 +277,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) - {% highlight python %} import numpy as np from sklearn import datasets -from SystemML.mllearn import LinearRegression +from systemml.mllearn import LinearRegression from pyspark.sql import SQLContext # Load the diabetes dataset diabetes = datasets.load_diabetes() http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/docs/beginners-guide-python.md ---------------------------------------------------------------------- diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md index 3b4aeed..f040212 100644 --- a/docs/beginners-guide-python.md +++ b/docs/beginners-guide-python.md @@ -72,7 +72,7 @@ brew install apache-spark #### Step 1: Install SystemML Python package ```bash -pip install SystemML +pip install systemml ``` #### Step 2: Download SystemML Java binaries @@ -81,14 +81,14 @@ SystemML Python package downloads the corresponding Java binaries (along with al into the installed location. 
To find the location of the downloaded Java binaries, use the following command: ```bash -python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")' +python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")' ``` #### Step 3: (Optional but recommended) Set SYSTEMML_HOME environment variable <div class="codetabs"> <div data-lang="OSX" markdown="1"> ```bash -SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'` +SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'` # If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively. echo '' >> ~/.bashrc echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc @@ -96,7 +96,7 @@ echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc </div> <div data-lang="Linux" markdown="1"> ```bash -SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("SystemML")[1], "SystemML-java")'` +SYSTEMML_HOME=`python -c 'import imp; import os; print os.path.join(imp.find_module("systemml")[1], "systemml-java")'` # If you are using zsh or ksh or csh, append it to ~/.zshrc or ~/.profile or ~/.login respectively. echo '' >> ~/.bashrc echo 'export SYSTEMML_HOME='$SYSTEMML_HOME >> ~/.bashrc @@ -128,7 +128,7 @@ pyspark --master local[*] --driver-class-path $SYSTEMML_HOME"/SystemML.jar" To get started with SystemML, let's try few elementary matrix multiplication operations: ```python -import SystemML as sml +import systemml as sml import numpy as np sml.setSparkContext(sc) m1 = sml.matrix(np.ones((3,3)) + 2) @@ -152,7 +152,7 @@ model: $ \beta = solve(X^T X, X^T y) $. 
For simplicity, we will use direct-solve ```python import numpy as np from sklearn import datasets -import SystemML as sml +import systemml as sml from pyspark.sql import SQLContext # Load the diabetes dataset diabetes = datasets.load_diabetes() @@ -196,7 +196,7 @@ algorithm. ```python import numpy as np from sklearn import datasets -from SystemML.mllearn import LinearRegression +from systemml.mllearn import LinearRegression from pyspark.sql import SQLContext # Load the diabetes dataset diabetes = datasets.load_diabetes() @@ -230,7 +230,7 @@ algorithm on digits datasets. ```python # Scikit-learn way from sklearn import datasets, neighbors -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) digits = datasets.load_digits() @@ -245,15 +245,21 @@ logistic = LogisticRegression(sqlCtx) print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test)) ``` +Output: + +```bash +LogisticRegression score: 0.922222 +``` + ### Passing PySpark DataFrame To train the above algorithm on larger dataset, we can load the dataset into DataFrame and pass it to the `fit` method: ```python from sklearn import datasets, neighbors -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression from pyspark.sql import SQLContext -import SystemML as sml +import systemml as sml sqlCtx = SQLContext(sc) digits = datasets.load_digits() X_digits = digits.data @@ -267,6 +273,12 @@ logistic = LogisticRegression(sqlCtx) print('LogisticRegression score: %f' % logistic.fit(df_train).score(X_test, y_test)) ``` +Output: + +```bash +LogisticRegression score: 0.922222 +``` + ### MLPipeline interface In the below example, we demonstrate how the same `LogisticRegression` class can allow SystemML to fit seamlessly into @@ -275,7 +287,7 @@ large data pipelines. 
```python # MLPipeline way from pyspark.ml import Pipeline -from SystemML.mllearn import LogisticRegression +from systemml.mllearn import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.sql import SQLContext sqlCtx = SQLContext(sc) @@ -307,6 +319,19 @@ prediction = model.transform(test) prediction.show() ``` +Output: + +```bash ++--+---------------+--------------------+--------------------+--------------------+---+----------+ +|id| text| words| features| probability| ID|prediction| ++--+---------------+--------------------+--------------------+--------------------+---+----------+ +|12| spark i j k|ArrayBuffer(spark...|(20,[5,6,7],[2.0,...|[0.99999999999975...|1.0| 1.0| +|13| l m n|ArrayBuffer(l, m, n)|(20,[8,9,10],[1.0...|[1.37552128844736...|2.0| 2.0| +|14|mapreduce spark|ArrayBuffer(mapre...|(20,[5,10],[1.0,1...|[0.99860290938153...|3.0| 1.0| +|15| apache hadoop|ArrayBuffer(apach...|(20,[9,14],[1.0,1...|[5.41688748236143...|4.0| 2.0| ++--+---------------+--------------------+--------------------+--------------------+---+----------+ +``` + ## Invoking DML/PyDML scripts using MLContext The below example demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml) @@ -315,7 +340,7 @@ using Python [MLContext API](https://apache.github.io/incubator-systemml/spark-m ```python from sklearn import datasets, neighbors from pyspark.sql import DataFrame, SQLContext -import SystemML as sml +import systemml as sml import pandas as pd import os sqlCtx = SQLContext(sc) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/MANIFEST.in ---------------------------------------------------------------------- diff --git a/src/main/python/MANIFEST.in b/src/main/python/MANIFEST.in index a185263..a0835d2 100644 --- a/src/main/python/MANIFEST.in +++ b/src/main/python/MANIFEST.in @@ -18,12 +18,12 @@ # under the 
License. # #------------------------------------------------------------- -include SystemML/SystemML-java/LICENSE -include SystemML/SystemML-java/SystemML-config.xml -include SystemML/SystemML-java/NOTICE -include SystemML/SystemML-java/SystemML.jar -include SystemML/SystemML-java/DISCLAIMER -include SystemML/SystemML-java/scripts/sparkDML.sh -recursive-include SystemML/SystemML-java/scripts/algorithms * -recursive-include SystemML/SystemML-java/scripts/datagen * -recursive-include SystemML/SystemML-java/scripts/utils * \ No newline at end of file +include systemml/systemml-java/LICENSE +include systemml/systemml-java/SystemML-config.xml +include systemml/systemml-java/NOTICE +include systemml/systemml-java/SystemML.jar +include systemml/systemml-java/DISCLAIMER +include systemml/systemml-java/scripts/sparkDML.sh +recursive-include systemml/systemml-java/scripts/algorithms * +recursive-include systemml/systemml-java/scripts/datagen * +recursive-include systemml/systemml-java/scripts/utils * \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/__init__.py b/src/main/python/SystemML/__init__.py deleted file mode 100644 index 02a940b..0000000 --- a/src/main/python/SystemML/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -from .mlcontext import * -from .defmatrix import * -from .converters import * - -__all__ = mlcontext.__all__ -__all__ += defmatrix.__all__ -__all__ += converters.__all__ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/converters.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/converters.py b/src/main/python/SystemML/converters.py deleted file mode 100644 index 9588bec..0000000 --- a/src/main/python/SystemML/converters.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, SQLContext -from pyspark.rdd import RDD -import numpy as np -import pandas as pd -import sklearn as sk - -from scipy.sparse import spmatrix -from scipy.sparse import coo_matrix - -SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) - -def getNumCols(numPyArr): - if numPyArr.ndim == 1: - return 1 - else: - return numPyArr.shape[1] - -def convertToLabeledDF(sqlCtx, X, y=None): - from pyspark.ml.feature import VectorAssembler - if y is not None: - pd1 = pd.DataFrame(X) - pd2 = pd.DataFrame(y, columns=['label']) - pdf = pd.concat([pd1, pd2], axis=1) - inputColumns = ['C' + str(i) for i in pd1.columns] - outputColumns = inputColumns + ['label'] - else: - pdf = pd.DataFrame(X) - inputColumns = ['C' + str(i) for i in pdf.columns] - outputColumns = inputColumns - assembler = VectorAssembler(inputCols=inputColumns, outputCol='features') - out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns)) - if y is not None: - return out.select('features', 'label') - else: - return out.select('features') - - -def convertToMatrixBlock(sc, src): - if isinstance(src, spmatrix): - src = coo_matrix(src, dtype=np.float64) - numRows = src.shape[0] - numCols = src.shape[1] - data = src.data - row = src.row.astype(np.int32) - col = src.col.astype(np.int32) - nnz = len(src.col) - buf1 = bytearray(data.tostring()) - buf2 = bytearray(row.tostring()) - buf3 = bytearray(col.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz) - elif isinstance(sc, SparkContext): - src = np.asarray(src) - numCols = getNumCols(src) - numRows = src.shape[0] - arr = src.ravel().astype(np.float64) - buf = bytearray(arr.tostring()) - return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, 
numCols) - else: - raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves - - -def convertToNumpyArr(sc, mb): - if isinstance(sc, SparkContext): - numRows = mb.getNumRows() - numCols = mb.getNumColumns() - buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) - return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols)) - else: - raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves - - -def convertToPandasDF(X): - if not isinstance(X, pd.DataFrame): - return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) - return X - -__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/defmatrix.py b/src/main/python/SystemML/defmatrix.py deleted file mode 100644 index 18f6314..0000000 --- a/src/main/python/SystemML/defmatrix.py +++ /dev/null @@ -1,410 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -import numpy as np - -from . import pydml, MLContext -from .converters import * -from pyspark import SparkContext, RDD -from pyspark.sql import DataFrame, SQLContext - -def setSparkContext(sc): - """ - Before using the matrix, the user needs to invoke this function. - - Parameters - ---------- - sc: SparkContext - SparkContext - """ - matrix.ml = MLContext(sc) - matrix.sc = sc - -def checkIfMLContextIsSet(): - if matrix.ml is None: - raise Exception('Expected setSparkContext(sc) to be called.') - -class DMLOp(object): - """ - Represents an intermediate node of Abstract syntax tree created to generate the PyDML script - """ - def __init__(self, inputs, dml=None): - self.inputs = inputs - self.dml = dml - - def _visit(self, execute=True): - matrix.dml = matrix.dml + self.dml - - -def reset(): - """ - Resets the visited status of matrix and the operators in the generated AST. 
- """ - for m in matrix.visited: - m.visited = False - matrix.visited = [] - -def binaryOp(lhs, rhs, opStr): - """ - Common function called by all the binary operators in matrix class - """ - inputs = [] - if isinstance(lhs, matrix): - lhsStr = lhs.ID - inputs = [lhs] - elif isinstance(lhs, float) or isinstance(lhs, int): - lhsStr = str(lhs) - else: - raise TypeError('Incorrect type') - if isinstance(rhs, matrix): - rhsStr = rhs.ID - inputs = inputs + [rhs] - elif isinstance(rhs, float) or isinstance(rhs, int): - rhsStr = str(rhs) - else: - raise TypeError('Incorrect type') - dmlOp = DMLOp(inputs) - out = matrix(None, op=dmlOp) - dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n'] - return out - -def binaryMatrixFunction(X, Y, fnName): - """ - Common function called by supported PyDML built-in function that has two arguments both of which are matrices. - TODO: This needs to be generalized to support arbitrary arguments of differen types. - """ - if not isinstance(X, matrix) or not isinstance(Y, matrix): - raise TypeError('Incorrect input type. 
Expected matrix type') - inputs = [X, Y] - dmlOp = DMLOp(inputs) - out = matrix(None, op=dmlOp) - dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n'] - return out - -def solve(A, b): - """ - Computes the least squares solution for system of linear equations A %*% x = b - - Examples - -------- - >>> import numpy as np - >>> from sklearn import datasets - >>> import SystemML as sml - >>> from pyspark.sql import SQLContext - >>> diabetes = datasets.load_diabetes() - >>> diabetes_X = diabetes.data[:, np.newaxis, 2] - >>> X_train = diabetes_X[:-20] - >>> X_test = diabetes_X[-20:] - >>> y_train = diabetes.target[:-20] - >>> y_test = diabetes.target[-20:] - >>> sml.setSparkContext(sc) - >>> X = sml.matrix(X_train) - >>> y = sml.matrix(y_train) - >>> A = X.transpose().dot(X) - >>> b = X.transpose().dot(y) - >>> beta = sml.solve(A, b).toNumPyArray() - >>> y_predicted = X_test.dot(beta) - >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) - Residual sum of squares: 25282.12 - """ - return binaryMatrixFunction(A, b, 'solve') - -def eval(outputs, outputDF=False, execute=True): - """ - Executes the unevaluated DML script and computes the matrices specified by outputs. - - Parameters - ---------- - outputs: list of matrices - outputDF: back the data of matrix as PySpark DataFrame - """ - checkIfMLContextIsSet() - reset() - matrix.dml = [] - matrix.script = pydml('') - if isinstance(outputs, matrix): - outputs = [ outputs ] - elif not isinstance(outputs, list): - raise TypeError('Incorrect input type') - for m in outputs: - m.output = True - m._visit(execute=execute) - if not execute: - return ''.join(matrix.dml) - matrix.script.scriptString = ''.join(matrix.dml) - results = matrix.ml.execute(matrix.script) - # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. 
- for m in outputs: - if outputDF: - m.data = results.getDataFrame(m.ID) - else: - m.data = results.getNumPyArray(m.ID) - -class matrix(object): - """ - matrix class is a python wrapper that implements basic matrix operator. - Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array. - - Examples - -------- - >>> import SystemML as sml - >>> import numpy as np - >>> sml.setSparkContext(sc) - - Welcome to Apache SystemML! - - >>> m1 = sml.matrix(np.ones((3,3)) + 2) - >>> m2 = sml.matrix(np.ones((3,3)) + 3) - >>> m2 = m1 * (m2 + m1) - >>> m4 = 1.0 - m2 - >>> m4 - # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods. - mVar1 = load(" ", format="csv") - mVar2 = load(" ", format="csv") - mVar3 = mVar2 + mVar1 - mVar4 = mVar1 * mVar3 - mVar5 = 1.0 - mVar4 - save(mVar5, " ") - - <SystemML.defmatrix.matrix object> - >>> m2.eval() - >>> m2 - # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method. - <SystemML.defmatrix.matrix object> - >>> m4 - # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods. 
- mVar4 = load(" ", format="csv") - mVar5 = 1.0 - mVar4 - save(mVar5, " ") - - <SystemML.defmatrix.matrix object> - >>> m4.sum(axis=1).toNumPyArray() - array([[-60.], - [-60.], - [-60.]]) - """ - # Global variable that is used to keep track of intermediate matrix variables in the DML script - systemmlVarID = 0 - - # Since joining of string is expensive operation, we collect the set of strings into list and then join - # them before execution: See matrix.script.scriptString = ''.join(matrix.dml) in eval() method - dml = [] - - # Represents MLContext's script object - script = None - - # Represents MLContext object - ml = None - - # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects - # that have been previously evaluated. - visited = [] - - def __init__(self, data, op=None): - """ - Constructs a lazy matrix - - Parameters - ---------- - data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation). - """ - checkIfMLContextIsSet() - self.visited = False - matrix.systemmlVarID += 1 - self.output = False - self.ID = 'mVar' + str(matrix.systemmlVarID) - if isinstance(data, SUPPORTED_TYPES): - self.data = data - elif hasattr(data, '_jdf'): - self.data = data - elif data is None and op is not None: - self.data = None - # op refers to the node of Abstract Syntax Tree created internally for lazy evaluation - self.op = op - else: - raise TypeError('Unsupported input type') - - def eval(self, outputDF=False): - """ - This is a convenience function that calls the global eval method - """ - eval([self], outputDF=False) - - def toPandas(self): - """ - This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame. 
- """ - if self.data is None: - self.eval() - return convertToPandasDF(self.data) - - def toNumPyArray(self): - """ - This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array. - """ - if self.data is None: - self.eval() - if isinstance(self.data, DataFrame): - self.data = self.data.toPandas().as_matrix() - # Always keep default format as NumPy array if possible - return self.data - - def toDataFrame(self): - """ - This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame. - """ - if self.data is None: - self.eval(outputDF=True) - if not isinstance(self.data, DataFrame): - if MLResults.sqlContext is None: - MLResults.sqlContext = SQLContext(matrix.sc) - self.data = sqlContext.createDataFrame(self.toPandas()) - return self.data - - def _visit(self, execute=True): - """ - This function is called for two scenarios: - 1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method. - 2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself - and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext. 
- """ - if self.visited: - return self - self.visited = True - # for cleanup - matrix.visited = matrix.visited + [ self ] - if self.data is not None: - matrix.dml = matrix.dml + [ self.ID, ' = load(\" \", format=\"csv\")\n'] - if isinstance(self.data, DataFrame) and execute: - matrix.script.input(self.ID, self.data) - elif execute: - matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data)) - return self - elif self.op is not None: - for m in self.op.inputs: - m._visit(execute=execute) - self.op._visit(execute=execute) - else: - raise Exception('Expected either op or data to be set') - if self.data is None and self.output: - matrix.dml = matrix.dml + ['save(', self.ID, ', \" \")\n'] - if execute: - matrix.script.output(self.ID) - return self - - def __repr__(self): - """ - This function helps to debug matrix class and also examine the generated PyDML script - """ - if self.data is None: - print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False)) - elif isinstance(self.data, DataFrame): - print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.') - else: - print('# This matrix (' + self.ID + ') is backed by NumPy array. 
To fetch the NumPy array, invoke toNumPyArray() method.') - return '<SystemML.defmatrix.matrix object>' - - def __add__(self, other): - return binaryOp(self, other, ' + ') - - def __sub__(self, other): - return binaryOp(self, other, ' - ') - - def __mul__(self, other): - return binaryOp(self, other, ' * ') - - def __floordiv__(self, other): - return binaryOp(self, other, ' // ') - - def __div__(self, other): - return binaryOp(self, other, ' / ') - - def __mod__(self, other): - return binaryOp(self, other, ' % ') - - def __pow__(self, other): - return binaryOp(self, other, ' ** ') - - def __radd__(self, other): - return binaryOp(other, self, ' + ') - - def __rsub__(self, other): - return binaryOp(other, self, ' - ') - - def __rmul__(self, other): - return binaryOp(other, self, ' * ') - - def __rfloordiv__(self, other): - return binaryOp(other, self, ' // ') - - def __rdiv__(self, other): - return binaryOp(other, self, ' / ') - - def __rmod__(self, other): - return binaryOp(other, self, ' % ') - - def __rpow__(self, other): - return binaryOp(other, self, ' ** ') - - def sum(self, axis=None): - return self._aggFn('sum', axis) - - def mean(self, axis=None): - return self._aggFn('mean', axis) - - def max(self, axis=None): - return self._aggFn('max', axis) - - def min(self, axis=None): - return self._aggFn('min', axis) - - def argmin(self, axis=None): - return self._aggFn('argmin', axis) - - def argmax(self, axis=None): - return self._aggFn('argmax', axis) - - def cumsum(self, axis=None): - return self._aggFn('cumsum', axis) - - def transpose(self, axis=None): - return self._aggFn('transpose', axis) - - def trace(self, axis=None): - return self._aggFn('trace', axis) - - def _aggFn(self, fnName, axis): - """ - Common function that is called for functions that have axis as parameter. 
- """ - dmlOp = DMLOp([self]) - out = matrix(None, op=dmlOp) - if axis is None: - dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n'] - else: - dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n'] - return out - - def dot(self, other): - return binaryMatrixFunction(self, other, 'dot') - -__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mlcontext.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/mlcontext.py b/src/main/python/SystemML/mlcontext.py deleted file mode 100644 index 1b90e70..0000000 --- a/src/main/python/SystemML/mlcontext.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- -import os - -try: - import py4j.java_gateway - from py4j.java_gateway import JavaObject -except ImportError: - raise ImportError('Unable to import JavaObject from py4j.java_gateway. 
Hint: Make sure you are running with pyspark') - -from pyspark import SparkContext -import pyspark.mllib.common -from pyspark.sql import DataFrame, SQLContext -from .converters import * - -def dml(scriptString): - """ - Create a dml script object based on a string. - - Parameters - ---------- - scriptString: string - Can be a path to a dml script or a dml script itself. - - Returns - ------- - script: Script instance - Instance of a script object. - """ - if not isinstance(scriptString, str): - raise ValueError("scriptString should be a string, got %s" % type(scriptString)) - return Script(scriptString, scriptType="dml") - - -def pydml(scriptString): - """ - Create a pydml script object based on a string. - - Parameters - ---------- - scriptString: string - Can be a path to a pydml script or a pydml script itself. - - Returns - ------- - script: Script instance - Instance of a script object. - """ - if not isinstance(scriptString, str): - raise ValueError("scriptString should be a string, got %s" % type(scriptString)) - return Script(scriptString, scriptType="pydml") - - -def _java2py(sc, obj): - """ Convert Java object to Python. """ - # TODO: Port this private PySpark function. - obj = pyspark.mllib.common._java2py(sc, obj) - if isinstance(obj, JavaObject): - class_name = obj.getClass().getSimpleName() - if class_name == 'Matrix': - obj = Matrix(obj, sc) - return obj - - -def _py2java(sc, obj): - """ Convert Python object to Java. """ - if isinstance(obj, Matrix): - obj = obj._java_matrix - # TODO: Port this private PySpark function. - obj = pyspark.mllib.common._py2java(sc, obj) - return obj - - -class Matrix(object): - """ - Wrapper around a Java Matrix object. - - Parameters - ---------- - javaMatrix: JavaObject - A Java Matrix object as returned by calling `ml.execute().get()`. 
- - sc: SparkContext - SparkContext - """ - def __init__(self, javaMatrix, sc): - self._java_matrix = javaMatrix - self.sc = sc - - def __repr__(self): - return "Matrix" - - def toDF(self): - """ - Convert the Matrix to a PySpark SQL DataFrame. - - Returns - ------- - df: PySpark SQL DataFrame - A PySpark SQL DataFrame representing the matrix, with - one "ID" column containing the row index (since Spark - DataFrames are unordered), followed by columns of doubles - for each column in the matrix. - """ - jdf = self._java_matrix.asDataFrame() - df = _java2py(self.sc, jdf) - return df - - -class MLResults(object): - """ - Wrapper around a Java ML Results object. - - Parameters - ---------- - results: JavaObject - A Java MLResults object as returned by calling `ml.execute()`. - - sc: SparkContext - SparkContext - """ - def __init__(self, results, sc): - self._java_results = results - self.sc = sc - try: - if MLResults.sqlContext is None: - MLResults.sqlContext = SQLContext(sc) - except AttributeError: - MLResults.sqlContext = SQLContext(sc) - - def __repr__(self): - return "MLResults" - - def getNumPyArray(self, *outputs): - """ - Parameters - ---------- - outputs: string, list of strings - Output variables as defined inside the DML script. - """ - outs = [convertToNumpyArr(self.sc, self._java_results.getMatrix(out).asBinaryBlockMatrix().getMatrixBlock()) for out in outputs] - if len(outs) == 1: - return outs[0] - return outs - - def getDataFrame(self, *outputs): - """ - Parameters - ---------- - outputs: string, list of strings - Output variables as defined inside the DML script. - """ - outs = [DataFrame(self._java_results.getDataFrame(out), MLResults.sqlContext) for out in outputs] - if len(outs) == 1: - return outs[0] - return outs - - def get(self, *outputs): - """ - Parameters - ---------- - outputs: string, list of strings - Output variables as defined inside the DML script. 
- """ - outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs] - if len(outs) == 1: - return outs[0] - return outs - - -class Script(object): - """ - Instance of a DML/PyDML Script. - - Parameters - ---------- - scriptString: string - Can be either a file path to a DML script or a DML script itself. - - scriptType: string - Script language, either "dml" for DML (R-like) or "pydml" for PyDML (Python-like). - """ - def __init__(self, scriptString, scriptType="dml"): - self.scriptString = scriptString - self.scriptType = scriptType - self._input = {} - self._output = [] - - def input(self, *args, **kwargs): - """ - Parameters - ---------- - args: name, value tuple - where name is a string, and currently supported value formats - are double, string, dataframe, rdd, and list of such object. - - kwargs: dict of name, value pairs - To know what formats are supported for name and value, look above. - """ - if args and len(args) != 2: - raise ValueError("Expected name, value pair.") - elif args: - self._input[args[0]] = args[1] - for name, value in kwargs.items(): - self._input[name] = value - return self - - def output(self, *names): - """ - Parameters - ---------- - names: string, list of strings - Output variables as defined inside the DML script. - """ - self._output.extend(names) - return self - - -class MLContext(object): - """ - Wrapper around the new SystemML MLContext. - - Parameters - ---------- - sc: SparkContext - SparkContext - """ - def __init__(self, sc): - if not isinstance(sc, SparkContext): - raise ValueError("Expected sc to be a SparkContext, got " % sc) - self._sc = sc - self._ml = sc._jvm.org.apache.sysml.api.mlcontext.MLContext(sc._jsc) - - def __repr__(self): - return "MLContext" - - def execute(self, script): - """ - Execute a DML / PyDML script. - - Parameters - ---------- - script: Script instance - Script instance defined with the appropriate input and output variables. 
- - Returns - ------- - ml_results: MLResults - MLResults instance. - """ - if not isinstance(script, Script): - raise ValueError("Expected script to be an instance of Script") - scriptString = script.scriptString - if script.scriptType == "dml": - if scriptString.endswith(".dml"): - if os.path.exists(scriptString): - script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString) - else: - raise ValueError("path: %s does not exist" % scriptString) - else: - script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString) - elif script.scriptType == "pydml": - if scriptString.endswith(".pydml"): - if os.path.exists(scriptString): - script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString) - else: - raise ValueError("path: %s does not exist" % scriptString) - else: - script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString) - - for key, val in script._input.items(): - # `in` is a reserved word ("keyword") in Python, so `script_java.in(...)` is not - # allowed. Therefore, we use the following code in which we retrieve a function - # representing `script_java.in`, and then call it with the arguments. This is in - # lieu of adding a new `input` method on the JVM side, as that would complicate use - # from Scala/Java. 
- py4j.java_gateway.get_method(script_java, "in")(key, _py2java(self._sc, val)) - for val in script._output: - script_java.out(val) - return MLResults(self._ml.execute(script_java), self._sc) - - -__all__ = ['MLResults', 'MLContext', 'Script', 'dml', 'pydml'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/mllearn/__init__.py b/src/main/python/SystemML/mllearn/__init__.py deleted file mode 100644 index 69cab58..0000000 --- a/src/main/python/SystemML/mllearn/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -from .estimators import * - -__all__ = estimators.__all__ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/SystemML/mllearn/estimators.py ---------------------------------------------------------------------- diff --git a/src/main/python/SystemML/mllearn/estimators.py b/src/main/python/SystemML/mllearn/estimators.py deleted file mode 100644 index 5d33d64..0000000 --- a/src/main/python/SystemML/mllearn/estimators.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/python -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, SQLContext -from pyspark.rdd import RDD -import numpy as np -import pandas as pd -import sklearn as sk -from pyspark.ml.feature import VectorAssembler -from pyspark.mllib.linalg import Vectors -from pyspark.ml import Estimator, Model - -from ..converters import * - -def assemble(sqlCtx, pdf, inputCols, outputCol): - tmpDF = sqlCtx.createDataFrame(pdf, list(pdf.columns)) - assembler = VectorAssembler(inputCols=list(inputCols), outputCol=outputCol) - return assembler.transform(tmpDF) - -class BaseSystemMLEstimator(Estimator): - featuresCol = 'features' - labelCol = 'label' - - def setFeaturesCol(self, colName): - """ - Sets the default column name for features of PySpark DataFrame. - - Parameters - ---------- - colName: column name for features (default: 'features') - """ - self.featuresCol = colName - - def setLabelCol(self, colName): - """ - Sets the default column name for features of PySpark DataFrame. 
- - Parameters - ---------- - colName: column name for features (default: 'label') - """ - self.labelCol = colName - - # Returns a model after calling fit(df) on Estimator object on JVM - def _fit(self, X): - """ - Invokes the fit method on Estimator object on JVM if X is PySpark DataFrame - - Parameters - ---------- - X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label') - """ - if hasattr(X, '_jdf') and self.featuresCol in X.columns and self.labelCol in X.columns: - self.model = self.estimator.fit(X._jdf) - return self - else: - raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - - def fit(self, X, y=None, params=None): - """ - Invokes the fit method on Estimator object on JVM if X and y are on of the supported data types - - Parameters - ---------- - X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix - y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix - """ - if y is None: - return self._fit(X) - elif y is not None and isinstance(X, SUPPORTED_TYPES) and isinstance(y, SUPPORTED_TYPES): - if self.transferUsingDF: - pdfX = convertToPandasDF(X) - pdfY = convertToPandasDF(y) - if getNumCols(pdfY) != 1: - raise Exception('y should be a column vector') - if pdfX.shape[0] != pdfY.shape[0]: - raise Exception('Number of rows of X and y should match') - colNames = pdfX.columns - pdfX[self.labelCol] = pdfY[pdfY.columns[0]] - df = assemble(self.sqlCtx, pdfX, colNames, self.featuresCol).select(self.featuresCol, self.labelCol) - self.model = self.estimator.fit(df._jdf) - else: - numColsy = getNumCols(y) - if numColsy != 1: - raise Exception('Expected y to be a column vector') - self.model = self.estimator.fit(convertToMatrixBlock(self.sc, X), convertToMatrixBlock(self.sc, y)) - if self.setOutputRawPredictionsToFalse: - self.model.setOutputRawPredictions(False) - return self - else: - raise Exception('Unsupported input type') - - def transform(self, X): - return 
self.predict(X) - - # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM - def predict(self, X): - """ - Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types - - Parameters - ---------- - X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame - """ - if isinstance(X, SUPPORTED_TYPES): - if self.transferUsingDF: - pdfX = convertToPandasDF(X) - df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol) - retjDF = self.model.transform(df._jdf) - retDF = DataFrame(retjDF, self.sqlCtx) - retPDF = retDF.sort('ID').select('prediction').toPandas() - if isinstance(X, np.ndarray): - return retPDF.as_matrix().flatten() - else: - return retPDF - else: - retNumPy = convertToNumpyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X))) - if isinstance(X, np.ndarray): - return retNumPy - else: - return retNumPy # TODO: Convert to Pandas - elif hasattr(X, '_jdf'): - if self.featuresCol in X.columns: - # No need to assemble as input DF is likely coming via MLPipeline - df = X - else: - assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol) - df = assembler.transform(X) - retjDF = self.model.transform(df._jdf) - retDF = DataFrame(retjDF, self.sqlCtx) - # Return DF - return retDF.sort('ID') - else: - raise Exception('Unsupported input type') - -class BaseSystemMLClassifier(BaseSystemMLEstimator): - - def score(self, X, y): - """ - Scores the predicted value with ground truth 'y' - - Parameters - ---------- - X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix - y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix - """ - return sk.metrics.accuracy_score(y, self.predict(X)) - -class BaseSystemMLRegressor(BaseSystemMLEstimator): - - def score(self, X, y): - """ - Scores the predicted value with ground truth 'y' - - Parameters - ---------- - X: NumPy ndarray, Pandas DataFrame, 
scipy sparse matrix - y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix - """ - return sk.metrics.r2_score(y, self.predict(X), multioutput='variance_weighted') - - -class LogisticRegression(BaseSystemMLClassifier): - def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): - """ - Performs both binomial and multinomial logistic regression. - - Parameters - ---------- - sqlCtx: PySpark SQLContext - penalty: Only 'l2' supported - fit_intercept: Specifies whether to add intercept or not (default: True) - max_iter: Maximum number of outer (Fisher scoring) iterations (default: 100) - max_inner_iter: Maximum number of inner (conjugate gradient) iterations, or 0 if no maximum limit provided (default: 0) - tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) - solver: Only 'newton-cg' solver supported - """ - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.uid = "logReg" - self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LogisticRegression(self.uid, self.sc._jsc.sc()) - self.estimator.setMaxOuterIter(max_iter) - self.estimator.setMaxInnerIter(max_inner_iter) - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C - self.estimator.setRegParam(reg) - self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) - self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = True - if penalty != 'l2': - raise Exception('Only l2 penalty is supported') - if solver != 'newton-cg': - raise Exception('Only newton-cg solver supported') - -class LinearRegression(BaseSystemMLRegressor): - - def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): - """ - Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.. 
- - Parameters - ---------- - sqlCtx: PySpark SQLContext - fit_intercept: Specifies whether to add intercept or not (default: True) - max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100) - tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) - solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg'). - Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient. - 'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and - input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient. - """ - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.uid = "lr" - if solver == 'newton-cg' or solver == 'direct-solve': - self.estimator = self.sc._jvm.org.apache.sysml.api.ml.LinearRegression(self.uid, self.sc._jsc.sc(), solver) - else: - raise Exception('Only newton-cg solver supported') - self.estimator.setMaxIter(max_iter) - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C - self.estimator.setRegParam(reg) - self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) - self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False - - -class SVM(BaseSystemMLClassifier): - - def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): - """ - Performs both binary-class and multiclass SVM (Support Vector Machines). 
- - Parameters - ---------- - sqlCtx: PySpark SQLContext - fit_intercept: Specifies whether to add intercept or not (default: True) - max_iter: Maximum number iterations (default: 100) - tol: Tolerance used in the convergence criterion (default: 0.000001) - C: 1/regularization parameter (default: 1.0) - is_multi_class: Specifies whether to use binary-class SVM or multi-class SVM algorithm (default: False) - """ - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.uid = "svm" - self.estimator = self.sc._jvm.org.apache.sysml.api.ml.SVM(self.uid, self.sc._jsc.sc(), is_multi_class) - self.estimator.setMaxIter(max_iter) - if C <= 0: - raise Exception('C has to be positive') - reg = 1.0 / C - self.estimator.setRegParam(reg) - self.estimator.setTol(tol) - self.estimator.setIcpt(int(fit_intercept)) - self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False - -class NaiveBayes(BaseSystemMLClassifier): - - def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): - """ - Performs both binary-class and multiclass SVM (Support Vector Machines). 
- - Parameters - ---------- - sqlCtx: PySpark SQLContext - laplace: Laplace smoothing specified by the user to avoid creation of 0 probabilities (default: 1.0) - """ - self.sqlCtx = sqlCtx - self.sc = sqlCtx._sc - self.uid = "nb" - self.estimator = self.sc._jvm.org.apache.sysml.api.ml.NaiveBayes(self.uid, self.sc._jsc.sc()) - self.estimator.setLaplace(laplace) - self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False - -__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/setup.py ---------------------------------------------------------------------- diff --git a/src/main/python/setup.py b/src/main/python/setup.py index 0bcebab..cc8f373 100644 --- a/src/main/python/setup.py +++ b/src/main/python/setup.py @@ -34,7 +34,7 @@ REQUIRED_PACKAGES = [ ] PACKAGE_DATA = [] -for path, subdirs, files in os.walk('SystemML/SystemML-java'): +for path, subdirs, files in os.walk('systemml/systemml-java'): for name in files: PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ] @@ -61,7 +61,7 @@ setup( install_requires=REQUIRED_PACKAGES, include_package_data=True, package_data={ - 'SystemML-java': PACKAGE_DATA + 'systemml-java': PACKAGE_DATA }, classifiers=[ 'Intended Audience :: Developers', http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py new file mode 100644 index 0000000..02a940b --- /dev/null +++ b/src/main/python/systemml/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +from .mlcontext import * +from .defmatrix import * +from .converters import * + +__all__ = mlcontext.__all__ +__all__ += defmatrix.__all__ +__all__ += converters.__all__ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/converters.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py new file mode 100644 index 0000000..9588bec --- /dev/null +++ b/src/main/python/systemml/converters.py @@ -0,0 +1,100 @@ +#!/usr/bin/python +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +from pyspark.context import SparkContext +from pyspark.sql import DataFrame, SQLContext +from pyspark.rdd import RDD +import numpy as np +import pandas as pd +import sklearn as sk + +from scipy.sparse import spmatrix +from scipy.sparse import coo_matrix + +SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) + +def getNumCols(numPyArr): + if numPyArr.ndim == 1: + return 1 + else: + return numPyArr.shape[1] + +def convertToLabeledDF(sqlCtx, X, y=None): + from pyspark.ml.feature import VectorAssembler + if y is not None: + pd1 = pd.DataFrame(X) + pd2 = pd.DataFrame(y, columns=['label']) + pdf = pd.concat([pd1, pd2], axis=1) + inputColumns = ['C' + str(i) for i in pd1.columns] + outputColumns = inputColumns + ['label'] + else: + pdf = pd.DataFrame(X) + inputColumns = ['C' + str(i) for i in pdf.columns] + outputColumns = inputColumns + assembler = VectorAssembler(inputCols=inputColumns, outputCol='features') + out = assembler.transform(sqlCtx.createDataFrame(pdf, outputColumns)) + if y is not None: + return out.select('features', 'label') + else: + return out.select('features') + + +def convertToMatrixBlock(sc, src): + if isinstance(src, spmatrix): + src = coo_matrix(src, dtype=np.float64) + numRows = src.shape[0] + numCols = src.shape[1] + data = src.data + row = src.row.astype(np.int32) + col = src.col.astype(np.int32) + nnz = len(src.col) + buf1 = bytearray(data.tostring()) + buf2 = bytearray(row.tostring()) + buf3 = bytearray(col.tostring()) + return 
sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertSciPyCOOToMB(buf1, buf2, buf3, numRows, numCols, nnz) + elif isinstance(sc, SparkContext): + src = np.asarray(src) + numCols = getNumCols(src) + numRows = src.shape[0] + arr = src.ravel().astype(np.float64) + buf = bytearray(arr.tostring()) + return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves + + +def convertToNumpyArr(sc, mb): + if isinstance(sc, SparkContext): + numRows = mb.getNumRows() + numCols = mb.getNumColumns() + buf = sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertMBtoPy4JDenseArr(mb) + return np.frombuffer(buf, count=numRows*numCols, dtype=np.float64).reshape((numRows, numCols)) + else: + raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves + + +def convertToPandasDF(X): + if not isinstance(X, pd.DataFrame): + return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) + return X + +__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/542de374/src/main/python/systemml/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py new file mode 100644 index 0000000..18f6314 --- /dev/null +++ b/src/main/python/systemml/defmatrix.py @@ -0,0 +1,410 @@ +#!/usr/bin/python +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
# See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

import numpy as np

from . import pydml, MLContext
from .converters import *
from pyspark import SparkContext, RDD
from pyspark.sql import DataFrame, SQLContext

def setSparkContext(sc):
    """
    Before using the matrix, the user needs to invoke this function.

    Parameters
    ----------
    sc: SparkContext
        SparkContext
    """
    matrix.ml = MLContext(sc)
    matrix.sc = sc

def checkIfMLContextIsSet():
    """
    Raises an Exception if setSparkContext(sc) has not been called yet.
    """
    if matrix.ml is None:
        raise Exception('Expected setSparkContext(sc) to be called.')

class DMLOp(object):
    """
    Represents an intermediate node of Abstract syntax tree created to generate the PyDML script
    """
    def __init__(self, inputs, dml=None):
        # inputs: list of matrix objects this operation reads
        # dml: list of string fragments forming the PyDML statement of this operation
        self.inputs = inputs
        self.dml = dml

    def _visit(self, execute=True):
        # Append this operation's statement to the script being assembled.
        matrix.dml.extend(self.dml)


def reset():
    """
    Resets the visited status of matrix and the operators in the generated AST.
    """
    for m in matrix.visited:
        m.visited = False
    matrix.visited = []

def binaryOp(lhs, rhs, opStr):
    """
    Common function called by all the binary operators in matrix class

    Parameters
    ----------
    lhs, rhs: matrix, float or int
    opStr: operator as it should appear in the generated PyDML (for example ' + ')

    Raises
    ------
    TypeError: if an operand is neither a matrix nor a float/int
    """
    inputs = []
    if isinstance(lhs, matrix):
        lhsStr = lhs.ID
        inputs = [lhs]
    elif isinstance(lhs, (float, int)):
        lhsStr = str(lhs)
    else:
        raise TypeError('Incorrect type')
    if isinstance(rhs, matrix):
        rhsStr = rhs.ID
        inputs = inputs + [rhs]
    elif isinstance(rhs, (float, int)):
        rhsStr = str(rhs)
    else:
        raise TypeError('Incorrect type')
    dmlOp = DMLOp(inputs)
    out = matrix(None, op=dmlOp)
    # The statement can only be built once the output matrix (and hence its ID) exists.
    dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n']
    return out

def binaryMatrixFunction(X, Y, fnName):
    """
    Common function called by supported PyDML built-in function that has two arguments both of which are matrices.
    TODO: This needs to be generalized to support arbitrary arguments of different types.

    Raises
    ------
    TypeError: if X or Y is not a matrix
    """
    if not isinstance(X, matrix) or not isinstance(Y, matrix):
        raise TypeError('Incorrect input type. Expected matrix type')
    inputs = [X, Y]
    dmlOp = DMLOp(inputs)
    out = matrix(None, op=dmlOp)
    dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n']
    return out

def solve(A, b):
    """
    Computes the least squares solution for system of linear equations A %*% x = b

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets
    >>> import systemml as sml
    >>> from pyspark.sql import SQLContext
    >>> diabetes = datasets.load_diabetes()
    >>> diabetes_X = diabetes.data[:, np.newaxis, 2]
    >>> X_train = diabetes_X[:-20]
    >>> X_test = diabetes_X[-20:]
    >>> y_train = diabetes.target[:-20]
    >>> y_test = diabetes.target[-20:]
    >>> sml.setSparkContext(sc)
    >>> X = sml.matrix(X_train)
    >>> y = sml.matrix(y_train)
    >>> A = X.transpose().dot(X)
    >>> b = X.transpose().dot(y)
    >>> beta = sml.solve(A, b).toNumPyArray()
    >>> y_predicted = X_test.dot(beta)
    >>> print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2))
    Residual sum of squares: 25282.12
    """
    return binaryMatrixFunction(A, b, 'solve')

def eval(outputs, outputDF=False, execute=True):
    """
    Executes the unevaluated DML script and computes the matrices specified by outputs.

    Parameters
    ----------
    outputs: list of matrices (a single matrix is also accepted)
    outputDF: back the data of matrix as PySpark DataFrame
    execute: if False, the script is only generated and returned as a string; nothing is run

    Returns
    -------
    The generated PyDML script if execute is False, otherwise None.

    Raises
    ------
    TypeError: if outputs is neither a matrix nor a list
    """
    checkIfMLContextIsSet()
    reset()
    matrix.dml = []
    matrix.script = pydml('')
    if isinstance(outputs, matrix):
        outputs = [ outputs ]
    elif not isinstance(outputs, list):
        raise TypeError('Incorrect input type')
    for m in outputs:
        m.output = True
        m._visit(execute=execute)
    if not execute:
        return ''.join(matrix.dml)
    matrix.script.scriptString = ''.join(matrix.dml)
    results = matrix.ml.execute(matrix.script)
    # Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.
    for m in outputs:
        if outputDF:
            m.data = results.getDataFrame(m.ID)
        else:
            m.data = results.getNumPyArray(m.ID)

class matrix(object):
    """
    matrix class is a python wrapper that implements basic matrix operator.
    Note: an evaluated matrix contains a data field computed by eval method as DataFrame or NumPy array.

    Examples
    --------
    >>> import systemml as sml
    >>> import numpy as np
    >>> sml.setSparkContext(sc)

    Welcome to Apache SystemML!

    >>> m1 = sml.matrix(np.ones((3,3)) + 2)
    >>> m2 = sml.matrix(np.ones((3,3)) + 3)
    >>> m2 = m1 * (m2 + m1)
    >>> m4 = 1.0 - m2
    >>> m4
    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
    mVar1 = load(" ", format="csv")
    mVar2 = load(" ", format="csv")
    mVar3 = mVar2 + mVar1
    mVar4 = mVar1 * mVar3
    mVar5 = 1.0 - mVar4
    save(mVar5, " ")

    <SystemML.defmatrix.matrix object>
    >>> m2.eval()
    >>> m2
    # This matrix (mVar4) is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.
    <SystemML.defmatrix.matrix object>
    >>> m4
    # This matrix (mVar5) is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.
    mVar4 = load(" ", format="csv")
    mVar5 = 1.0 - mVar4
    save(mVar5, " ")

    <SystemML.defmatrix.matrix object>
    >>> m4.sum(axis=1).toNumPyArray()
    array([[-60.],
           [-60.],
           [-60.]])
    """
    # Global variable that is used to keep track of intermediate matrix variables in the DML script
    systemmlVarID = 0

    # Since joining of string is expensive operation, we collect the set of strings into list and then join
    # them before execution: See matrix.script.scriptString = ''.join(matrix.dml) in eval() method
    dml = []

    # Represents MLContext's script object
    script = None

    # Represents MLContext object
    ml = None

    # SparkContext registered through setSparkContext(sc)
    sc = None

    # SQLContext created lazily by toDataFrame() and reused afterwards
    _sqlContext = None

    # Contains list of nodes visited in Abstract Syntax Tree. This helps to avoid computation of matrix objects
    # that have been previously evaluated.
    visited = []

    def __init__(self, data, op=None):
        """
        Constructs a lazy matrix

        Parameters
        ----------
        data: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame. (data cannot be None for external users, 'data=None' is used internally for lazy evaluation).
        op: node of the Abstract Syntax Tree created internally for lazy evaluation (only used when data is None).

        Raises
        ------
        TypeError: if data has an unsupported type, or if both data and op are None
        """
        checkIfMLContextIsSet()
        self.visited = False
        matrix.systemmlVarID += 1
        self.output = False
        self.ID = 'mVar' + str(matrix.systemmlVarID)
        # Always define op so that every instance carries the attribute.
        self.op = op
        # Objects exposing '_jdf' are accepted as-is (presumably PySpark DataFrames).
        if isinstance(data, SUPPORTED_TYPES) or hasattr(data, '_jdf'):
            self.data = data
        elif data is None and op is not None:
            self.data = None
        else:
            raise TypeError('Unsupported input type')

    def eval(self, outputDF=False):
        """
        This is a convenience function that calls the global eval method

        Parameters
        ----------
        outputDF: back the data of this matrix as PySpark DataFrame instead of NumPy array
        """
        # Inside a method body the name 'eval' resolves to the module-level function.
        eval([self], outputDF=outputDF)

    def toPandas(self):
        """
        This is a convenience function that calls the global eval method and then converts the matrix object into Pandas DataFrame.
        """
        if self.data is None:
            self.eval()
        return convertToPandasDF(self.data)

    def toNumPyArray(self):
        """
        This is a convenience function that calls the global eval method and then converts the matrix object into NumPy array.
        """
        if self.data is None:
            self.eval()
        if isinstance(self.data, DataFrame):
            # '.values' replaces DataFrame.as_matrix(), which newer pandas releases no longer provide.
            self.data = self.data.toPandas().values
        # Always keep default format as NumPy array if possible
        return self.data

    def toDataFrame(self):
        """
        This is a convenience function that calls the global eval method and then converts the matrix object into DataFrame.
        """
        if self.data is None:
            self.eval(outputDF=True)
        if not isinstance(self.data, DataFrame):
            # Create the SQLContext once and cache it on the class.
            if matrix._sqlContext is None:
                matrix._sqlContext = SQLContext(matrix.sc)
            self.data = matrix._sqlContext.createDataFrame(self.toPandas())
        return self.data

    def _visit(self, execute=True):
        """
        This function is called for two scenarios:
        1. For printing the PyDML script which has not yet been evaluated (execute=False). See '__repr__' method.
        2. Called as part of 'eval' method (execute=True). In this scenario, it builds the PyDML script by visiting itself
        and its child nodes. Also, it does appropriate registration as input or output that is required by MLContext.
        """
        if self.visited:
            return self
        self.visited = True
        # for cleanup
        matrix.visited.append(self)
        if self.data is not None:
            matrix.dml.extend([self.ID, ' = load(" ", format="csv")\n'])
            if isinstance(self.data, DataFrame) and execute:
                matrix.script.input(self.ID, self.data)
            elif execute:
                matrix.script.input(self.ID, convertToMatrixBlock(matrix.sc, self.data))
            return self
        elif self.op is not None:
            for m in self.op.inputs:
                m._visit(execute=execute)
            self.op._visit(execute=execute)
        else:
            raise Exception('Expected either op or data to be set')
        if self.data is None and self.output:
            matrix.dml.extend(['save(', self.ID, ', " ")\n'])
            if execute:
                matrix.script.output(self.ID)
        return self

    def __repr__(self):
        """
        This function helps to debug matrix class and also examine the generated PyDML script

        Note: the description is printed as a side effect; only a short tag is returned.
        """
        if self.data is None:
            print('# This matrix (' + self.ID + ') is backed by below given PyDML script (which is not yet evaluated). To fetch the data of this matrix, invoke toNumPyArray() or toDataFrame() or toPandas() methods.\n' + eval([self], execute=False))
        elif isinstance(self.data, DataFrame):
            print('# This matrix (' + self.ID + ') is backed by PySpark DataFrame. To fetch the DataFrame, invoke toDataFrame() method.')
        else:
            print('# This matrix (' + self.ID + ') is backed by NumPy array. To fetch the NumPy array, invoke toNumPyArray() method.')
        return '<SystemML.defmatrix.matrix object>'

    def __add__(self, other):
        return binaryOp(self, other, ' + ')

    def __sub__(self, other):
        return binaryOp(self, other, ' - ')

    def __mul__(self, other):
        return binaryOp(self, other, ' * ')

    def __floordiv__(self, other):
        return binaryOp(self, other, ' // ')

    def __div__(self, other):
        return binaryOp(self, other, ' / ')

    def __truediv__(self, other):
        # Python 3 (and 'from __future__ import division') dispatches '/' here instead of __div__.
        return binaryOp(self, other, ' / ')

    def __mod__(self, other):
        return binaryOp(self, other, ' % ')

    def __pow__(self, other):
        return binaryOp(self, other, ' ** ')

    def __radd__(self, other):
        return binaryOp(other, self, ' + ')

    def __rsub__(self, other):
        return binaryOp(other, self, ' - ')

    def __rmul__(self, other):
        return binaryOp(other, self, ' * ')

    def __rfloordiv__(self, other):
        return binaryOp(other, self, ' // ')

    def __rdiv__(self, other):
        return binaryOp(other, self, ' / ')

    def __rtruediv__(self, other):
        # Reflected counterpart of __truediv__.
        return binaryOp(other, self, ' / ')

    def __rmod__(self, other):
        return binaryOp(other, self, ' % ')

    def __rpow__(self, other):
        return binaryOp(other, self, ' ** ')

    def sum(self, axis=None):
        return self._aggFn('sum', axis)

    def mean(self, axis=None):
        return self._aggFn('mean', axis)

    def max(self, axis=None):
        return self._aggFn('max', axis)

    def min(self, axis=None):
        return self._aggFn('min', axis)

    def argmin(self, axis=None):
        return self._aggFn('argmin', axis)

    def argmax(self, axis=None):
        return self._aggFn('argmax', axis)

    def cumsum(self, axis=None):
        return self._aggFn('cumsum', axis)

    def transpose(self, axis=None):
        return self._aggFn('transpose', axis)

    def trace(self, axis=None):
        return self._aggFn('trace', axis)

    def _aggFn(self, fnName, axis):
        """
        Common function that is called for functions that have axis as parameter.
        """
        dmlOp = DMLOp([self])
        out = matrix(None, op=dmlOp)
        if axis is None:
            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ')\n']
        else:
            dmlOp.dml = [out.ID, ' = ', fnName, '(', self.ID, ', axis=', str(axis) ,')\n']
        return out

    def dot(self, other):
        return binaryMatrixFunction(self, other, 'dot')

__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve']
