This is an automated email from the ASF dual-hosted git repository. janardhan pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 97633c74d5fcf4d884aef433192ada6c977d465d Author: Erwin Tumbul <erwin.tum...@student.tugraz.at> AuthorDate: Fri Dec 3 16:17:41 2021 +0530 [SYSTEMDS-3235] Scikit-learn to SystemDS dml converter * Basic sklearn pipelines to dml script mapping * List of supported algorithms DBSCANMapper, l2svm, Gaussian, GLM, MultiLogReg * A distinction between supervised and unsupervised algorithms added. * Simple testing to verify the transformation, and add error handling * Includes basic design documentation and decisions. Co-authored-by: Erwin Tumbul <erwin.tum...@student.tugraz.at> Co-authored-by: Alexander Thien <alexander.th...@student.tugraz.at> Co-authored-by: Mathias Kahr <m.k...@student.tugraz.at> --- scripts/staging/sklearn/.gitignore | 7 ++ scripts/staging/sklearn/SklearnToDMLMapper.py | 139 +++++++++++++++++++++ scripts/staging/sklearn/SklearnToDMLMapper.rst | 72 +++++++++++ scripts/staging/sklearn/mapped_functions.rst | 46 +++++++ scripts/staging/sklearn/mappers/__init__.py | 25 ++++ scripts/staging/sklearn/mappers/mapper.py | 63 ++++++++++ scripts/staging/sklearn/mappers/supervised.py | 88 +++++++++++++ scripts/staging/sklearn/mappers/transformations.py | 81 ++++++++++++ scripts/staging/sklearn/mappers/unsupervised.py | 83 ++++++++++++ scripts/staging/sklearn/poc/design.rst | 65 ++++++++++ scripts/staging/sklearn/poc/poc.py | 103 +++++++++++++++ scripts/staging/sklearn/run_tests.py | 99 +++++++++++++++ scripts/staging/sklearn/tests/input_X.csv | 10 ++ scripts/staging/sklearn/tests/input_X.csv.mtd | 1 + scripts/staging/sklearn/tests/input_Y.csv | 10 ++ scripts/staging/sklearn/tests/input_Y.csv.mtd | 12 ++ scripts/staging/sklearn/tests/util.py | 101 +++++++++++++++ 17 files changed, 1005 insertions(+) diff --git a/scripts/staging/sklearn/.gitignore b/scripts/staging/sklearn/.gitignore new file mode 100644 index 0000000..2508296 --- /dev/null +++ b/scripts/staging/sklearn/.gitignore @@ -0,0 +1,7 @@ +*.pkl +*.dml +*.csv +*.csv.mtd + +!/tests/input_X.* 
# scripts/staging/sklearn/SklearnToDMLMapper.py
import pickle
import sys
import inspect
import mappers
import argparse


class SklearnToDMLMapper:
    """Transform a scikit-learn pipeline into a DML script.

    The tool can be used through the command line interface, where the
    scikit-learn pipeline is provided as a pickle file, or from a script as a
    regular Python class.

    Args:
        pipeline (sklearn.pipeline.Pipeline): sklearn pipeline
        input_name (str, optional): Name for the input variable (prefix).
            Defaults to 'input'. Depending on the pipeline two files are
            necessary. Example: input_name="input" maps to files input_X.csv
            and input_Y.csv for a pipeline ending in a supervised algorithm.
    """

    def __init__(self, pipeline, input_name='input'):
        """Create an SklearnToDMLMapper."""
        self.steps = pipeline.steps
        self.functions = self.__get_functions()
        self.dml_script = None
        self.input_name = input_name

    def __get_functions(self):
        """Return the supported mapper classes keyed by their sklearn step name."""
        functions = {}
        for _, cls in inspect.getmembers(sys.modules['mappers'], inspect.isclass):
            # sklearn_name is a class attribute, so there is no need to
            # instantiate the class just to read it. Classes without a
            # sklearn_name (e.g. the Mapper base class) are not registered.
            sklearn_name = getattr(cls, 'sklearn_name', None)
            if sklearn_name is not None:
                functions[sklearn_name] = cls
        return functions

    def __get_algorithm(self):
        """Return a mapper instance for the last pipeline step (the algorithm).

        Raises:
            RuntimeError: If the pipeline is empty or its last step has no
                registered mapper.
        """
        if not self.steps:
            raise RuntimeError('The pipeline contains no steps.')

        name = self.steps[-1][0]
        mapper_cls = self.functions.get(name)
        if mapper_cls is None:
            raise RuntimeError(f'{name} is not supported.')
        return mapper_cls()

    def __get_input(self):
        """Return the DML read statements for the pipeline input(s)."""
        func = self.__get_algorithm()

        # "$<name>" is emitted literally: the generated script refers to
        # script arguments named <input_name>_X / <input_name>_Y.
        if func.is_supervised:
            return f'X = read(${self.input_name}_X)\nY = read(${self.input_name}_Y)'
        return f'X = read(${self.input_name}_X)'

    def __get_output(self):
        """Return one DML write statement per output of the final algorithm."""
        func = self.__get_algorithm()
        return '\n'.join(f'write({output}, "{output}.csv")'
                         for output in func.mapped_output)

    def transform(self):
        """Transforms a sklearn pipeline in a .dml script.

        Returns:
            str: The transformed .dml script.

        Raises:
            RuntimeError: If the last pipeline step is not supported.
        """
        sources = []
        calls = []

        for name, step in self.steps:
            # Steps without a registered mapper are left out of the script.
            if name not in self.functions:
                continue

            mapper = self.functions[name](step.get_params())
            calls.append(mapper.get_call())
            sources.append(mapper.get_source())

        self.dml_script = "{}\n\n{}\n\n{}\n\n{}".format('\n'.join(sources),
                                                        self.__get_input(),
                                                        '\n'.join(calls),
                                                        self.__get_output())
        return self.dml_script

    def save(self, path):
        """Saves the transformed .dml script.

        Args:
            path (str): Location where the DML script is to be saved.

        Raises:
            RuntimeError: Save can only be called if a transformation was
                executed beforehand.
        """
        if self.dml_script is None:
            raise RuntimeError('Transformation was not applied yet.')

        with open(path, 'w') as f:
            f.write(self.dml_script)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Tool that parses a sklearn pipeline and produces a dml script')
    parser.add_argument('path',
                        metavar='path',
                        type=str,
                        help='Location of the sklearn pipeline saved as pickle file')
    parser.add_argument('-i',
                        dest='input_name',
                        metavar='input_name',
                        type=str,
                        default='X',
                        help='Name for the input variable (prefix). Depending on the pipeline two files are necessary. Example: input_name="input". Maps to files input_X.csv and input_Y.csv for a pipeline ending in a supervised algorithm.')
    parser.add_argument('-o',
                        dest='output',
                        metavar='output',
                        type=str,
                        default='./pipeline.dml',
                        help='Path for the dml output script')

    # parse_args() returns a Namespace: values are attributes, not dict keys.
    args = parser.parse_args()

    try:
        # NOTE(security): unpickling executes arbitrary code contained in the
        # file; only pass pickle files that come from a trusted source.
        with open(args.path, 'rb') as f:
            pipeline = pickle.load(f)

        mapper = SklearnToDMLMapper(pipeline, args.input_name)
        mapper.transform()
        mapper.save(args.output)
    except Exception as e:
        # Top-level boundary: report the problem and signal failure to the shell.
        print(f'Failed to transform pipeline.\nError:\n{e}', file=sys.stderr)
        sys.exit(1)
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +SklearnToDMLMapper +================== + +SklearnToDMLMapper is a simple tool for transforming scikit-learn pipelines into DML scripts. +This tool may be used over a simple command line interface, where a scikit-learn pipeline is provided as a `pickle <https://docs.python.org/3/library/pickle.html>`_ file. Alternatively, SklearnToDMLMapper can be used in a script as a Python module. + + +Prerequisites +------------- + +If a pickle file is provided, no dependencies are necessary except for Python 3.6+. +Otherwise, scikit-learn needs to be `installed <https://scikit-learn.org/stable/install.html>`_. + +Usage +----- + +For usage over the CLI, an example call may look as follows:: + + python SklearnToDMLMapper.py -i input -o output_path pipe.pkl + +* input: name (prefix) of the input file(s) (see below) +* output_path: transformed pipeline as .dml script +* pipe.pkl: binary file (pickle) of a sklearn pipeline + +Used as a Python module, a script may look as follows:: + + from sklearn.pipeline import make_pipeline + # Other imports from sklearn + from SklearnToDMLMapper import SklearnToDMLMapper + + pipeline = make_pipeline(...) 
+ + mapper = SklearnToDMLMapper(pipeline, 'input') + mapper.transform() + mapper.save('mapped_pipeline') + +or, alternatively using a pickle file:: + + from SklearnToDMLMapper import SklearnToDMLMapper + + with open('pipeline.pkl', 'rb') as f: + pipeline = pickle.load(f) + + mapper = SklearnToDMLMapper(pipeline, 'input') + mapper.transform() + mapper.save('mapped_pipeline') + +API description +--------------- + +.. autoclass:: SklearnToDMLMapper \ No newline at end of file diff --git a/scripts/staging/sklearn/mapped_functions.rst b/scripts/staging/sklearn/mapped_functions.rst new file mode 100644 index 0000000..4db4ada --- /dev/null +++ b/scripts/staging/sklearn/mapped_functions.rst @@ -0,0 +1,46 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +Mapped functions +================ + +Classification +-------------- + +Supervised +"""""""""" +* glm.dml <=> `TweedieRegressor <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html#sklearn.linear_model.TweedieRegressor>`_ +* l2svm.dml <=> `LinearSVC <https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC>`_ +* multiLogReg.dml <=> `LogisticRegression <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`_ + +Unsupervised +"""""""""""" +* dbscan.dml <=> `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN>`_ +* kmeans.dml <=> `KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans>`_ +* gmm.dml <=> `GaussianMixture <https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture>`_ + +Transformations +--------------- +* scale.dml <=> `StandardScaler <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler>`_ +* normalize.dml <=> `Normalizer <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer>`_ +* pca.dml <=> `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA>`_ +* imputeByMean.dml/imputeByMedian.dml <=> `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer>`_ + diff --git a/scripts/staging/sklearn/mappers/__init__.py b/scripts/staging/sklearn/mappers/__init__.py new file mode 100644 index 0000000..3dfe2e8 --- /dev/null +++ b/scripts/staging/sklearn/mappers/__init__.py @@ -0,0 +1,25 @@ +# 
------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +from .mapper import * +from .supervised import * +from .unsupervised import * +from .transformations import * diff --git a/scripts/staging/sklearn/mappers/mapper.py b/scripts/staging/sklearn/mappers/mapper.py new file mode 100644 index 0000000..6904d10 --- /dev/null +++ b/scripts/staging/sklearn/mappers/mapper.py @@ -0,0 +1,63 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# scripts/staging/sklearn/mappers/mapper.py
import os

builtin_path = "scripts/builtin"


def scripts_home():
    """Return the directory that holds the builtin DML scripts.

    ``builtin_path`` is returned unchanged unless the ``SYSTEMDS_HOME``
    environment variable is set; in that case it is prefixed with its value.
    """
    root = os.getenv('SYSTEMDS_HOME')
    return builtin_path if root is None else f'{root}/{builtin_path}'


class Mapper:
    """Base class describing how one sklearn step is rendered as a DML call.

    Subclasses set the class attributes below and override ``map_params`` to
    translate the sklearn parameters into the positional DML arguments.
    """

    # Name of the DML builtin script / function (without the "m_" prefix).
    name = None
    # Name of the step inside the sklearn pipeline.
    sklearn_name = None

    # Positional arguments passed after the data arguments.
    mapped_params = []
    # Names of the variables the DML call assigns to.
    mapped_output = []

    # Intermediate steps write their result back to X.
    is_intermediate = False
    # Supervised steps receive Y in addition to X.
    is_supervised = False

    def __init__(self, params=None):
        """Store ``params`` and map them when they are provided."""
        self.params = params
        if params is None:
            return
        self.map_params()

    def get_source(self):
        """Return the DML ``source`` statement importing this builtin."""
        return f'source("{scripts_home()}/{self.name}.dml") as ns_{self.name}'

    def get_call(self):
        """Return the DML statement that invokes the mapped function."""
        arguments = ['X', 'Y'] if self.is_supervised else ['X']
        arguments.extend(self.mapped_params)

        if self.is_intermediate:
            targets = 'X'
        else:
            targets = ', '.join(self.mapped_output)

        rendered = ', '.join(str(argument) for argument in arguments)
        return f"[{targets}] = ns_{self.name}::m_{self.name}({rendered})"

    def map_params(self):
        """Hook for subclasses; the base class maps nothing."""
        pass
# scripts/staging/sklearn/mappers/supervised.py
from .mapper import Mapper


def _dml_bool(value):
    """Render a Python truth value as a DML boolean literal."""
    return 'TRUE' if value else 'FALSE'


class LinearSVMMapper(Mapper):
    """Maps sklearn's LinearSVC onto the DML builtin l2svm."""
    name = 'l2svm'
    sklearn_name = 'linearsvc'
    is_supervised = True
    mapped_output = [
        'model'
    ]

    def map_params(self):
        """Translate the LinearSVC parameters into the positional l2svm arguments."""
        self.mapped_params = [
            _dml_bool(self.params.get('fit_intercept', False)),
            self.params.get('tol', 0.001),
            self.params.get('C', 1.0),
            self.params.get('max_iter', 100),
            20,  # maxii parameter is unknown in sklearn and not documented in dml
            _dml_bool(self.params.get('verbose', False)),
            -1  # column_id is unknown in sklearn
        ]


class TweedieRegressorMapper(Mapper):
    """Maps sklearn's TweedieRegressor onto the DML builtin glm."""
    name = 'glm'
    sklearn_name = 'tweedieregressor'
    is_supervised = True
    mapped_output = [
        'beta'
    ]

    def map_params(self):
        """Translate the TweedieRegressor parameters into the positional glm arguments."""
        # TODO: many parameters cannot be mapped directly:
        # how to handle defaults for dml?
        self.mapped_params = [
            1,  # sklearn impl supports power only, dfam
            self.params.get('power', 0.0),  # vpow
            0,  # link
            1.0,  # lpow
            0.0,  # yneg
            # icpt: 0 = no intercept, 1 = add intercept. The third DML case
            # (2 = add intercept and rescale X) has no sklearn counterpart.
            1 if self.params.get('fit_intercept', True) else 0,  # icpt
            0.0,  # disp
            0.0,  # reg
            self.params.get('tol', 0.000001),  # tol
            200,  # moi
            0,  # mii
            _dml_bool(self.params.get('verbose', False))
        ]


class LogisticRegressionMapper(Mapper):
    """Maps sklearn's LogisticRegression onto the DML builtin multiLogReg."""
    name = 'multiLogReg'
    sklearn_name = 'logisticregression'
    is_supervised = True
    mapped_output = [
        'beta'
    ]

    def map_params(self):
        """Translate the LogisticRegression parameters into the positional multiLogReg arguments."""
        self.mapped_params = [
            # icpt: 0 = no intercept, 1 = add intercept. The third DML case
            # (2 = add intercept and rescale X) has no sklearn counterpart.
            1 if self.params.get('fit_intercept', True) else 0,
            self.params.get('tol', 0.000001),  # tol
            # NOTE(review): sklearn's C is an inverse regularization strength
            # while this slot is the DML regularization argument -- confirm
            # whether the value has to be converted.
            self.params.get('C', 0.0),  # reg
            100,  # maxi
            0,  # maxii
            _dml_bool(self.params.get('verbose', False))
        ]
+# +# ------------------------------------------------------------- + +from .mapper import Mapper + +class StandardScalerMapper(Mapper): + name = 'scale' + sklearn_name = 'standardscaler' + is_intermediate = True + mapped_output = [ + 'Y' + ] + + def map_params(self): + self.mapped_params = [ + 'TRUE' if self.params.get('with_mean', True) else 'FALSE', + 'TRUE' if self.params.get('with_std', True) else 'FALSE' + ] + +class NormalizeMapper(Mapper): + name = 'normalize' + sklearn_name = 'normalizer' + is_intermediate = True + mapped_output = [ + 'Y' + ] + + def map_params(self): + self.mapped_params = [] + + +class SimpleImputerMapper(Mapper): + name = 'impute' + sklearn_name = 'simpleimputer' + is_intermediate = True + mapped_output = [ + 'X' + ] + + def map_params(self): # might update naming ? + if self.params['strategy'] == 'median': + self.name = 'imputeByMedian' + else: + self.name = 'imputeByMean' + + self.mapped_params = [] + + +class PCAMapper(Mapper): + name = 'pca' + sklearn_name = 'pca' + is_intermediate = True + mapped_output = [ + 'Xout', + 'Mout' + ] + + def map_params(self): + self.mapped_params = [ + 2 if self.params['n_components'] is None else self.params['random_state'], + 'TRUE', # non existant in SKlearn + 'TRUE' # non existant in SKlearn + ] diff --git a/scripts/staging/sklearn/mappers/unsupervised.py b/scripts/staging/sklearn/mappers/unsupervised.py new file mode 100644 index 0000000..5191f87 --- /dev/null +++ b/scripts/staging/sklearn/mappers/unsupervised.py @@ -0,0 +1,83 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
# scripts/staging/sklearn/mappers/unsupervised.py
from .mapper import Mapper


class KmeansMapper(Mapper):
    """Maps sklearn's KMeans onto the DML builtin kmeans."""
    name = 'kmeans'
    sklearn_name = 'kmeans'
    mapped_output = [
        'C',  # The output matrix with the centroids
        'Y'  # The mapping of records to centroids
    ]

    def map_params(self):
        """Translate the KMeans parameters into the positional kmeans arguments."""
        seed = self.params['random_state']
        self.mapped_params = [
            self.params['n_clusters'],
            self.params['n_init'],
            self.params['max_iter'],
            self.params['tol'],
            'TRUE' if self.params.get('verbose', False) else 'FALSE',
            50,  # avg_sample_size_per_centroid unknown in sklearn
            -1 if seed is None else seed
        ]


class DBSCANMapper(Mapper):
    """Maps sklearn's DBSCAN onto the DML builtin dbscan."""
    name = 'dbscan'
    sklearn_name = 'dbscan'
    mapped_output = [
        'clusterMembers'
    ]

    def map_params(self):
        """Forward eps and min_samples."""
        self.mapped_params = [
            self.params.get('eps', 0.5),
            self.params.get('min_samples', 5)
        ]


class GaussianMixtureMapper(Mapper):
    """Maps sklearn's GaussianMixture onto the DML builtin gmm."""
    name = 'gmm'
    sklearn_name = 'gaussianmixture'
    # sklearn covariance_type -> DML gmm model name
    model_map = {
        'full': 'VVV',
        'tied': 'EEE',
        'diag': 'VVI',
        'spherical': 'VII'  # one variance per component; 'VVI' is the diagonal model
    }
    mapped_output = [
        'weight',
        'labels',
        'df',
        'bic'
    ]

    def map_params(self):
        """Translate the GaussianMixture parameters into the positional gmm arguments."""
        # Look the sklearn name up in model_map and only then fall back to
        # the DML default, so a missing/unknown type never renders as "None".
        model = self.model_map.get(self.params.get('covariance_type', 'full'), 'VVV')
        # Both string arguments must be quoted to be valid DML string literals.
        init_params = self.params.get('init_params', 'kmeans')
        self.mapped_params = [
            self.params.get('n_components', 3),
            f'"{model}"',
            f'"{init_params}"',
            self.params.get('max_iter', 100),
            self.params.get('reg_covar', 1e-6),
            self.params.get('tol', 0.000001)
        ]
0000000..b07e748 --- /dev/null +++ b/scripts/staging/sklearn/poc/design.rst @@ -0,0 +1,65 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +Scikit-learn - Importer +======================= +Scikit-learn_ is a very popular and well-established open-source Python library for data science applications. A large number of common algorithms and many useful tools are implemented and maintained. + +Idea +---- +Allowing the import of sklearn models allows for an easy extension of already established implementations with systemds. + +Current State +^^^^^^^^^^^^^ +Currently an ONNX_ importer is in staging, which is somehow broken or rather breaks something else in the stable branch. + +This tool allows the conversion of ONNX graphs to dml. + +The Importer +------------ +We have the following idea. Since the ONNX importer is (to some degree) working, we suggest the modification/extension of the importer to work with sklearn models. + +When reading from a saved model we expect for either approach a pickle_ serialized Python object, since scikit-learn uses Python's built-in `persistence model`_. 
joblib_ is a pickle replacement, which works more efficiently on large/complex objects, which is the case with some scikit-learn models. In both cases there are some security and maintainability concerns_ to be kept in mind. + +Proposal 1 +^^^^^^^^^^ +One possible approach to this problem is a direct mapping from scikit-learn to DML. But the effort for this approach may be out of scope for this pull request (for now). + +Proposal 2 +^^^^^^^^^^ +An easier approach would involve an indirect mapping to ONNX and then to DML: + +sklearn --> onnx && onnx --> dml ==> sklearn --> dml + +Sklearn models may be converted to ONNX using the sklearn-onnx_ converter, part of the official ONNX project. The conversion from ONNX to DML can be accomplished using the existing ONNX-Importer of systemds. + +This approach requires fixing the onnx importer and the inclusion of a further dependency. sklearn-onnx_ is published under an MIT license and requires a few other dependencies_. + + +.. _Scikit-learn: https://scikit-learn.org/stable/index.html +.. _sklearn-onnx: https://github.com/onnx/sklearn-onnx +.. _dependencies: https://github.com/onnx/sklearn-onnx/blob/master/requirements.txt +.. _ONNX: http://onnx.ai/sklearn-onnx/ +.. _sklearn_related: https://scikit-learn.org/stable/related_projects.html#related-projects +.. _`persistence model`: https://scikit-learn.org/stable/modules/model_persistence.html +.. _pickle: https://docs.python.org/3/library/pickle.html +.. _joblib: https://joblib.readthedocs.io/en/latest/persistence.html +.. 
_concerns: https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations diff --git a/scripts/staging/sklearn/poc/poc.py b/scripts/staging/sklearn/poc/poc.py new file mode 100755 index 0000000..dd26660 --- /dev/null +++ b/scripts/staging/sklearn/poc/poc.py @@ -0,0 +1,103 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +import pickle + +def dump(): + from sklearn.linear_model import LinearRegression + from sklearn.preprocessing import StandardScaler + from sklearn.cluster import KMeans + + from sklearn.pipeline import make_pipeline + + pipeline = make_pipeline(StandardScaler(), KMeans()) + + print('Sklearn pipeline:') + print(pipeline) + + with open('pipe.pkl', 'wb') as f: + pickle.dump(pipeline, f) + + print() +dump() + +# source scripts from /scripts/builtin/ +# call functions: https://apache.github.io/systemds/site/dml-language-reference.html#user-defined-function-udf + +def map_lm(sklearn_func): + # TODO + call = 'm_lm()'.format() + +def map_kmeans(sklearn_func): + ''' + m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer max_iter = 1000, + Double eps = 0.000001, Boolean is_verbose = FALSE, Integer avg_sample_size_per_centroid = 50, + Integer seed = -1) + return (Matrix[Double] C, Matrix[Double] Y) + ''' + params = sklearn_func.get_params() + return 'm_kmeans(X, {}, {}, {})'.format(params['n_clusters'], params['n_init'], params['max_iter'], params['tol']) + +def map_scale(sklearn_func): + params = sklearn_func.get_params() + # handle default params as in dml definiton + # handle type mappings + return 'm_scale(X, {}, {})'.format(params['with_mean'], params['with_std']) + +algorithms = { + "linearregression": ("lm", map_lm), + "standardscaler": ("scale", map_scale), + "kmeans": ("kmeans", map_kmeans) +} + +# use setwd for this? +builtin_path = "scripts/builtin" + +sources = [] + +dml_pipeline = [] + +# use jinja templating for this? +# source directory? +# create sperate source file which contains supported algorithms +# and combine into common namespace? 
+ +# validate contents of pipeline: +# intermediate steps need to be transformative +# and the last step fits an estimator +# see https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html + +with open('pipe.pkl', 'rb') as f: + loaded = pickle.load(f) + +for i, (sklearn_name, algorithm) in enumerate(loaded.steps): + name, mapping = algorithms[sklearn_name] + call = mapping(algorithm) + sources.append('source("{}/{}") as ns_{}'.format(builtin_path, name, name)) + # step_i will be needed in following steps + dml_pipeline.append('step_{} = ns_{}::{}'.format(i, name, call)) + +dml_script = '\n'.join(sources) +dml_script += '\n\n' +dml_script += '\n'.join(dml_pipeline) + +print('DML Script') +print(dml_script) diff --git a/scripts/staging/sklearn/run_tests.py b/scripts/staging/sklearn/run_tests.py new file mode 100755 index 0000000..9aaac30 --- /dev/null +++ b/scripts/staging/sklearn/run_tests.py @@ -0,0 +1,99 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +import sys +import os +import subprocess +import argparse +import logging + +from sklearn.preprocessing import StandardScaler, Normalizer +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans, DBSCAN +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.linear_model import TweedieRegressor, LogisticRegression +from sklearn.mixture import GaussianMixture + +from SklearnToDMLMapper import SklearnToDMLMapper +from tests.util import test_script, compare_script, get_systemds_root + +def test_valid(name, pipeline): + mapper = SklearnToDMLMapper(pipeline) + mapper.transform() + path = f'{name}_gen.dml' + mapper.save(path) + return test_script(path) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--log', action='store', default='ERROR', + help='Set logging level (ERROR, INFO, DEBUG).') + + options = parser.parse_args() + numeric_level = getattr(logging, options.log.upper(), None) + if not isinstance(numeric_level, int): + raise ValueError(f'Invalid log level: {options.log}') + logging.basicConfig(level=numeric_level) + + try: + get_systemds_root() + except Exception as e: + logging.error(e) + exit(-1) + + + valid_pipelines = [ + make_pipeline(StandardScaler(), KMeans()), + make_pipeline(Normalizer(), KMeans()), + make_pipeline(SimpleImputer(strategy='mean'), KMeans()), + make_pipeline(SimpleImputer(strategy='median'), KMeans()), + make_pipeline(Normalizer(), LinearSVC()), + make_pipeline(Normalizer(), TweedieRegressor()), + make_pipeline(StandardScaler(), LogisticRegression()), + make_pipeline(Normalizer(), LogisticRegression()), + #TODO: Tests which use PCA or DBSCAN, trigger a NullPointerException during parsing for some reason + make_pipeline(StandardScaler(), DBSCAN()), + make_pipeline(Normalizer(), DBSCAN()), + make_pipeline(SimpleImputer(strategy='mean'), 
DBSCAN()), + make_pipeline(SimpleImputer(strategy='median'), DBSCAN()), + make_pipeline(PCA(), KMeans()), + make_pipeline(PCA(), DBSCAN()), + # TODO: GaussianMixtureModel results in LanguageException -- ERROR: [line 0:0] -- Function get_sample_maps() is undefined. + make_pipeline(StandardScaler(), GaussianMixture()), + make_pipeline(Normalizer(), GaussianMixture()) + ] + + valid_results = [] + valid_tests_names = [] + for i, pipeline in enumerate(valid_pipelines): + name = f'test_{i}_' + '_'.join([s[0] for s in pipeline.steps]) + logging.info('*' * 50) + logging.info((18*'*' + name + (50-20-len(name)) * '*')) + result = test_valid(name, pipeline) + valid_results.append(result) + valid_tests_names.append(name) + + print('*' * 50) + print('Finished all tests.') + for (name, r) in zip(valid_tests_names, valid_results): + print('{}: {}'.format(name, 'Failed' if not r else 'Success')) diff --git a/scripts/staging/sklearn/tests/input_X.csv b/scripts/staging/sklearn/tests/input_X.csv new file mode 100644 index 0000000..f6e6b31 --- /dev/null +++ b/scripts/staging/sklearn/tests/input_X.csv @@ -0,0 +1,10 @@ +-7.237310391208174210e+00,-9.031086522545416884e+00 +-8.165501360870660363e+00,-7.008504394784431213e+00 +-7.022668436942145931e+00,-7.570412890908222892e+00 +-8.863943061317664629e+00,-5.053239814677235486e+00 +8.525185826796044530e-02,3.645282967948058506e+00 +-7.941522766238410247e-01,2.104951171962878842e+00 +-1.340520809891420972e+00,4.157119493365751595e+00 +-1.032012970766660942e+01,-4.337402902031620044e+00 +-2.187731658211975017e+00,3.333521246686991013e+00 +-8.535604566608126831e+00,-6.013489256860858667e+00 diff --git a/scripts/staging/sklearn/tests/input_X.csv.mtd b/scripts/staging/sklearn/tests/input_X.csv.mtd new file mode 100644 index 0000000..1acb285 --- /dev/null +++ b/scripts/staging/sklearn/tests/input_X.csv.mtd @@ -0,0 +1 @@ +{"rows": 10, "cols": 2, "format": "csv"} diff --git a/scripts/staging/sklearn/tests/input_Y.csv 
b/scripts/staging/sklearn/tests/input_Y.csv new file mode 100644 index 0000000..2b81a96 --- /dev/null +++ b/scripts/staging/sklearn/tests/input_Y.csv @@ -0,0 +1,10 @@ +3.0 +3.0 +3.0 +2.0 +1.0 +1.0 +1.0 +2.0 +1.0 +2.0 diff --git a/scripts/staging/sklearn/tests/input_Y.csv.mtd b/scripts/staging/sklearn/tests/input_Y.csv.mtd new file mode 100644 index 0000000..4b8fdf2 --- /dev/null +++ b/scripts/staging/sklearn/tests/input_Y.csv.mtd @@ -0,0 +1,12 @@ +{ + "data_type": "matrix", + "value_type": "double", + "rows": 10, + "cols": 1, + "nnz": 10, + "format": "csv", + "author": "mathias", + "header": false, + "sep": ",", + "created": "2021-02-16 17:11:40 CET" +} \ No newline at end of file diff --git a/scripts/staging/sklearn/tests/util.py b/scripts/staging/sklearn/tests/util.py new file mode 100755 index 0000000..8d80c4a --- /dev/null +++ b/scripts/staging/sklearn/tests/util.py @@ -0,0 +1,101 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
#
# -------------------------------------------------------------

import sys
import os
import subprocess
import difflib
import logging


def get_systemds_root():
    """Return the SystemDS root directory read from ``SYSTEMDS_ROOT``.

    Raises:
        KeyError: if ``SYSTEMDS_ROOT`` is not set in the environment.
    """
    try:
        return os.environ['SYSTEMDS_ROOT']
    except KeyError as error:
        # Chain explicitly so the traceback names the failed lookup as cause.
        raise KeyError(f"SYSTEMDS_ROOT is not set.\nError\n{error}") from error


def get_sklearn_root():
    """Return the sklearn staging directory below the SystemDS root."""
    return f'{get_systemds_root()}/scripts/staging/sklearn'


def invoke_systemds(path):
    """Run the DML script at *path* with ``<SYSTEMDS_ROOT>/bin/systemds``.

    The script path is made relative to the current working directory and the
    two test inputs are supplied as named arguments after ``-nvargs``.

    Returns:
        True if the launcher exits with status 0, writes nothing to stderr
        and its stdout does not contain "error" (case-insensitive).
        False otherwise, including when the launcher cannot be started or
        times out; the details are logged at error level.

    Raises:
        KeyError: if ``SYSTEMDS_ROOT`` is not set.
    """
    root = get_systemds_root()
    script_path = os.path.relpath(path, os.getcwd())
    # One argv element per argument: subprocess does not split strings, so a
    # fused "-nvargs a=b c=d" element only works if the launcher re-splits it.
    command = [
        root + "/bin/systemds",
        script_path,
        '-nvargs',
        'input_X=tests/input_X.csv',
        'input_Y=tests/input_Y.csv',
    ]

    try:
        result = subprocess.run(command,
                                check=True,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                timeout=10000)

        stdout_text = result.stdout.decode('utf-8', errors='replace')
        stderr_text = result.stderr.decode('utf-8', errors='replace')
        logging.debug('*' * 100)
        logging.debug('\n' + stdout_text)
        logging.debug('\n' + stderr_text)
        logging.debug('*' * 100)

        # It looks like python does not notice systemds errors
        # Is 0 returned in error cases?
        # Check if there is any error and raise manually.
        if result.stderr or 'error' in stdout_text.lower():
            raise subprocess.CalledProcessError(returncode=result.returncode, cmd=result.args,
                                                stderr=result.stderr, output=result.stdout)

    except subprocess.CalledProcessError as systemds_error:
        logging.error("Failed to run systemds!")
        logging.error("Error code: " + str(systemds_error.returncode))
        logging.error("Stdout:")
        logging.error(systemds_error.output.decode("utf-8", errors="replace"))
        logging.error("Stderr:")
        logging.error(systemds_error.stderr.decode("utf-8", errors="replace"))
        return False
    except (OSError, subprocess.TimeoutExpired) as launch_error:
        # Missing/non-executable launcher or a hung run: report it as a
        # failed run instead of aborting the caller with a traceback.
        logging.error("Failed to run systemds!")
        logging.error(launch_error)
        return False
    logging.info("Successfully executed script.")
    return True


def test_script(path):
    """Run the generated script at *path* on SystemDS and return the result.

    Returns:
        The boolean returned by ``invoke_systemds(path)``.
    """
    logging.info('#' * 30)
    logging.info('Running generated script on systemds.')
    result = invoke_systemds(path)
    logging.info('Finished test.')
    return result


# Helper, not a test case: keep pytest from collecting it (and failing on the
# unknown "path" fixture) wherever this name ends up in a test module.
test_script.__test__ = False


# Compares two script using diff
def compare_script(actual, expected):
    """Compare a generated script with its expected counterpart.

    Args:
        actual: file name relative to the sklearn staging directory.
        expected: file name relative to ``tests/expected`` below it.

    Returns:
        True if both files have identical lines. False if they differ (the
        differing lines are logged at info level) or if the comparison could
        not be carried out (the error is logged at error level).
    """
    try:
        sklearn_root = get_sklearn_root()
        # Context managers close both files even if reading fails.
        with open(f'{sklearn_root}/tests/expected/{expected}') as f_expected:
            expected_lines = f_expected.readlines()
        with open(f'{sklearn_root}/{actual}') as f_actual:
            actual_lines = f_actual.readlines()

        # ndiff marks lines unique to its first argument with "- " and lines
        # unique to its second with "+ ", so "expected" has to come first for
        # the legend logged below to be true.
        diff = difflib.ndiff(expected_lines, actual_lines)
        changes = [l.strip() for l in diff if not l.startswith(' ')]
        logging.info('#' * 30)
        if len(changes) == 0:
            logging.info('Actual script matches expected script.')
            return True
        else:
            logging.info('Actual script does not match expected script.')
            logging.info('Legend:')
            logging.info('    "+ " ... line unique to actual script')
            logging.info('    "- " ... line unique to expected script')
            logging.info('    "? " ... line not present in either script')
            logging.info('#' * 30)
            logging.info('\n' + '\n'.join(changes))
            logging.info('#' * 30)
            return False
    except Exception as e:
        logging.error('Failed to compare script.')
        logging.error(e)
        return False