Repository: incubator-systemml Updated Branches: refs/heads/master 1f6d55e9e -> 701b9e319
[SYSTEMML-895] Cleanup Python Package Various cleanups of the Python code to be more Pythonic, adhering to PEP 8. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/701b9e31 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/701b9e31 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/701b9e31 Branch: refs/heads/master Commit: 701b9e319daa8140faff5193e2f4d0401f55db0d Parents: 1f6d55e Author: Mike Dusenberry <[email protected]> Authored: Fri Sep 2 13:59:00 2016 -0700 Committer: Mike Dusenberry <[email protected]> Committed: Fri Sep 2 13:59:00 2016 -0700 ---------------------------------------------------------------------- src/main/python/setup.py | 23 ++++--- src/main/python/systemml/__init__.py | 7 +-- src/main/python/systemml/converters.py | 25 +++----- src/main/python/systemml/defmatrix.py | 19 +++--- src/main/python/systemml/mlcontext.py | 4 +- src/main/python/systemml/mllearn/__init__.py | 7 +-- src/main/python/systemml/mllearn/estimators.py | 69 ++++++++++----------- 7 files changed, 75 insertions(+), 79 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/setup.py ---------------------------------------------------------------------- diff --git a/src/main/python/setup.py b/src/main/python/setup.py index cc8f373..5cb498f 100644 --- a/src/main/python/setup.py +++ b/src/main/python/setup.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -8,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,8 +19,8 @@ # #------------------------------------------------------------- -from setuptools import setup, find_packages import os +from setuptools import find_packages, setup import time VERSION = '0.11.0.dev1' @@ -37,22 +36,22 @@ PACKAGE_DATA = [] for path, subdirs, files in os.walk('systemml/systemml-java'): for name in files: PACKAGE_DATA = PACKAGE_DATA + [ os.path.join(path, name).replace('./', '') ] - + setup( - name='SystemML', + name='systemml', version=VERSION, description='Apache SystemML is a distributed and declarative machine learning platform.', long_description=''' - + Apache SystemML is an effort undergoing incubation at the Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. - - Apache SystemML provides declarative large-scale machine learning (ML) that aims at - flexible specification of ML algorithms and automatic generation of hybrid runtime + + Apache SystemML provides declarative large-scale machine learning (ML) that aims at + flexible specification of ML algorithms and automatic generation of hybrid runtime plans ranging from single-node, in-memory computations, to distributed computations on Apache Hadoop and Apache Spark. - + Note: This is not a released version and was built with SNAPSHOT available on the date''' + RELEASED_DATE, url='http://systemml.apache.org/', author='Apache SystemML', @@ -74,4 +73,4 @@ setup( 'Topic :: Software Development :: Libraries', ], license='Apache 2.0', - ) \ No newline at end of file + ) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/__init__.py b/src/main/python/systemml/__init__.py index 02a940b..04d521b 100644 --- a/src/main/python/systemml/__init__.py +++ b/src/main/python/systemml/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -8,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,4 +25,4 @@ from .converters import * __all__ = mlcontext.__all__ __all__ += defmatrix.__all__ -__all__ += converters.__all__ \ No newline at end of file +__all__ += converters.__all__ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/converters.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/converters.py b/src/main/python/systemml/converters.py index 9588bec..243a507 100644 --- a/src/main/python/systemml/converters.py +++ b/src/main/python/systemml/converters.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -8,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,15 +19,12 @@ # #------------------------------------------------------------- -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, SQLContext -from pyspark.rdd import RDD +__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF'] + import numpy as np import pandas as pd -import sklearn as sk - -from scipy.sparse import spmatrix -from scipy.sparse import coo_matrix +from pyspark.context import SparkContext +from scipy.sparse import coo_matrix, spmatrix SUPPORTED_TYPES = (np.ndarray, pd.DataFrame, spmatrix) @@ -37,7 +33,8 @@ def getNumCols(numPyArr): return 1 else: return numPyArr.shape[1] - + + def convertToLabeledDF(sqlCtx, X, y=None): from pyspark.ml.feature import VectorAssembler if y is not None: @@ -56,7 +53,7 @@ def convertToLabeledDF(sqlCtx, X, y=None): return out.select('features', 'label') else: return out.select('features') - + def convertToMatrixBlock(sc, src): if isinstance(src, spmatrix): @@ -80,7 +77,7 @@ def convertToMatrixBlock(sc, src): return sc._jvm.org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt.convertPy4JArrayToMB(buf, numRows, numCols) else: raise TypeError('sc needs to be of type SparkContext') # TODO: We can generalize this by creating py4j gateway ourselves - + def convertToNumpyArr(sc, mb): if isinstance(sc, SparkContext): @@ -96,5 +93,3 @@ def convertToPandasDF(X): if not isinstance(X, pd.DataFrame): return pd.DataFrame(X, columns=['C' + str(i) for i in range(getNumCols(X))]) return X - -__all__ = [ 'getNumCols', 'convertToMatrixBlock', 'convertToNumpyArr', 'convertToPandasDF', 'SUPPORTED_TYPES' , 'convertToLabeledDF'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/defmatrix.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/defmatrix.py b/src/main/python/systemml/defmatrix.py index 18f6314..2994092 100644 --- a/src/main/python/systemml/defmatrix.py +++ b/src/main/python/systemml/defmatrix.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -20,13 +19,14 @@ # #------------------------------------------------------------- -import numpy as np +__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve'] -from . import pydml, MLContext -from .converters import * -from pyspark import SparkContext, RDD +from pyspark import SparkContext from pyspark.sql import DataFrame, SQLContext +from . import MLContext, pydml +from .converters import * + def setSparkContext(sc): """ Before using the matrix, the user needs to invoke this function. @@ -39,10 +39,12 @@ def setSparkContext(sc): matrix.ml = MLContext(sc) matrix.sc = sc + def checkIfMLContextIsSet(): if matrix.ml is None: raise Exception('Expected setSparkContext(sc) to be called.') + class DMLOp(object): """ Represents an intermediate node of Abstract syntax tree created to generate the PyDML script @@ -63,6 +65,7 @@ def reset(): m.visited = False matrix.visited = [] + def binaryOp(lhs, rhs, opStr): """ Common function called by all the binary operators in matrix class @@ -87,6 +90,7 @@ def binaryOp(lhs, rhs, opStr): dmlOp.dml = [out.ID, ' = ', lhsStr, opStr, rhsStr, '\n'] return out + def binaryMatrixFunction(X, Y, fnName): """ Common function called by supported PyDML built-in function that has two arguments both of which are matrices. @@ -100,6 +104,7 @@ def binaryMatrixFunction(X, Y, fnName): dmlOp.dml = [out.ID, ' = ', fnName,'(', X.ID, ', ', Y.ID, ')\n'] return out + def solve(A, b): """ Computes the least squares solution for system of linear equations A %*% x = b @@ -128,6 +133,7 @@ def solve(A, b): """ return binaryMatrixFunction(A, b, 'solve') + def eval(outputs, outputDF=False, execute=True): """ Executes the unevaluated DML script and computes the matrices specified by outputs. @@ -159,6 +165,7 @@ def eval(outputs, outputDF=False, execute=True): else: m.data = results.getNumPyArray(m.ID) + class matrix(object): """ matrix class is a python wrapper that implements basic matrix operator. @@ -406,5 +413,3 @@ class matrix(object): def dot(self, other): return binaryMatrixFunction(self, other, 'dot') - -__all__ = [ 'setSparkContext', 'matrix', 'eval', 'solve'] http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mlcontext.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/mlcontext.py b/src/main/python/systemml/mlcontext.py index 1b90e70..c578a8e 100644 --- a/src/main/python/systemml/mlcontext.py +++ b/src/main/python/systemml/mlcontext.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -19,6 +18,7 @@ # under the License. # #------------------------------------------------------------- + import os try: @@ -26,10 +26,10 @@ try: from py4j.java_gateway import JavaObject except ImportError: raise ImportError('Unable to import JavaObject from py4j.java_gateway. Hint: Make sure you are running with pyspark') - from pyspark import SparkContext import pyspark.mllib.common from pyspark.sql import DataFrame, SQLContext + from .converters import * def dml(scriptString): http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/__init__.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/mllearn/__init__.py b/src/main/python/systemml/mllearn/__init__.py index 69cab58..8132405 100644 --- a/src/main/python/systemml/mllearn/__init__.py +++ b/src/main/python/systemml/mllearn/__init__.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -8,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,4 +21,4 @@ from .estimators import * -__all__ = estimators.__all__ \ No newline at end of file +__all__ = estimators.__all__ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/701b9e31/src/main/python/systemml/mllearn/estimators.py ---------------------------------------------------------------------- diff --git a/src/main/python/systemml/mllearn/estimators.py b/src/main/python/systemml/mllearn/estimators.py index 5d33d64..97ab6bb 100644 --- a/src/main/python/systemml/mllearn/estimators.py +++ b/src/main/python/systemml/mllearn/estimators.py @@ -1,4 +1,3 @@ -#!/usr/bin/python #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -8,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,15 +19,13 @@ # #------------------------------------------------------------- -from pyspark.context import SparkContext -from pyspark.sql import DataFrame, SQLContext -from pyspark.rdd import RDD +__all__ = ['LinearRegression', 'LogisticRegression', 'SVM', 'NaiveBayes'] + import numpy as np -import pandas as pd -import sklearn as sk +from pyspark.ml import Estimator from pyspark.ml.feature import VectorAssembler -from pyspark.mllib.linalg import Vectors -from pyspark.ml import Estimator, Model +from pyspark.sql import DataFrame +import sklearn as sk from ..converters import * @@ -40,32 +37,32 @@ def assemble(sqlCtx, pdf, inputCols, outputCol): class BaseSystemMLEstimator(Estimator): featuresCol = 'features' labelCol = 'label' - + def setFeaturesCol(self, colName): """ Sets the default column name for features of PySpark DataFrame. - + Parameters ---------- colName: column name for features (default: 'features') """ self.featuresCol = colName - + def setLabelCol(self, colName): """ Sets the default column name for features of PySpark DataFrame. - + Parameters ---------- colName: column name for features (default: 'label') """ self.labelCol = colName - - # Returns a model after calling fit(df) on Estimator object on JVM + + # Returns a model after calling fit(df) on Estimator object on JVM def _fit(self, X): """ Invokes the fit method on Estimator object on JVM if X is PySpark DataFrame - + Parameters ---------- X: PySpark DataFrame that contain the columns featuresCol (default: 'features') and labelCol (default: 'label') @@ -75,11 +72,11 @@ class BaseSystemMLEstimator(Estimator): return self else: raise Exception('Incorrect usage: Expected dataframe as input with features/label as columns') - + def fit(self, X, y=None, params=None): """ Invokes the fit method on Estimator object on JVM if X and y are on of the supported data types - + Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix @@ -109,15 +106,15 @@ class BaseSystemMLEstimator(Estimator): return self else: raise Exception('Unsupported input type') - + def transform(self, X): return self.predict(X) - - # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM + + # Returns either a DataFrame or MatrixBlock after calling transform(X:MatrixBlock, y:MatrixBlock) on Model object on JVM def predict(self, X): """ Invokes the transform method on Estimator object on JVM if X and y are on of the supported data types - + Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame @@ -152,26 +149,28 @@ class BaseSystemMLEstimator(Estimator): return retDF.sort('ID') else: raise Exception('Unsupported input type') - + + class BaseSystemMLClassifier(BaseSystemMLEstimator): def score(self, X, y): """ Scores the predicted value with ground truth 'y' - + Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix y: NumPy ndarray, Pandas DataFrame, scipy sparse matrix """ - return sk.metrics.accuracy_score(y, self.predict(X)) + return sk.metrics.accuracy_score(y, self.predict(X)) + class BaseSystemMLRegressor(BaseSystemMLEstimator): def score(self, X, y): """ Scores the predicted value with ground truth 'y' - + Parameters ---------- X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix @@ -184,7 +183,7 @@ class LogisticRegression(BaseSystemMLClassifier): def __init__(self, sqlCtx, penalty='l2', fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): """ Performs both binomial and multinomial logistic regression. - + Parameters ---------- sqlCtx: PySpark SQLContext @@ -215,12 +214,13 @@ class LogisticRegression(BaseSystemMLClassifier): if solver != 'newton-cg': raise Exception('Only newton-cg solver supported') + class LinearRegression(BaseSystemMLRegressor): def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg', transferUsingDF=False): """ Performs linear regression to model the relationship between one numerical response variable and one or more explanatory (feature) variables.. - + Parameters ---------- sqlCtx: PySpark SQLContext @@ -228,7 +228,7 @@ class LinearRegression(BaseSystemMLRegressor): max_iter: Maximum number of conjugate gradient iterations, or 0 if no maximum limit provided (default: 100) tol: Tolerance used in the convergence criterion (default: 0.000001) C: 1/regularization parameter (default: 1.0) - solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg'). + solver: Supports either 'newton-cg' or 'direct-solve' (default: 'newton-cg'). Depending on the size and the sparsity of the feature matrix, one or the other solver may be more efficient. 'direct-solve' solver is more efficient when the number of features is relatively small (m < 1000) and input matrix X is either tall or fairly dense; otherwise 'newton-cg' solver is more efficient. @@ -256,7 +256,7 @@ class SVM(BaseSystemMLClassifier): def __init__(self, sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False, transferUsingDF=False): """ Performs both binary-class and multiclass SVM (Support Vector Machines). - + Parameters ---------- sqlCtx: PySpark SQLContext @@ -278,14 +278,15 @@ class SVM(BaseSystemMLClassifier): self.estimator.setTol(tol) self.estimator.setIcpt(int(fit_intercept)) self.transferUsingDF = transferUsingDF - self.setOutputRawPredictionsToFalse = False + self.setOutputRawPredictionsToFalse = False + class NaiveBayes(BaseSystemMLClassifier): def __init__(self, sqlCtx, laplace=1.0, transferUsingDF=False): """ Performs both binary-class and multiclass SVM (Support Vector Machines). - + Parameters ---------- sqlCtx: PySpark SQLContext @@ -298,5 +299,3 @@ class NaiveBayes(BaseSystemMLClassifier): self.estimator.setLaplace(laplace) self.transferUsingDF = transferUsingDF self.setOutputRawPredictionsToFalse = False - -__all__ = ['LogisticRegression', 'LinearRegression', 'SVM', 'NaiveBayes']
