[systemds] 01/03: [SYSTEMDS-3235] Scikit-learn to SystemDS dml converter

janardhan Mon, 06 Dec 2021 00:52:54 -0800

This is an automated email from the ASF dual-hosted git repository.

janardhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


commit 97633c74d5fcf4d884aef433192ada6c977d465d
Author: Erwin Tumbul <[email protected]>
AuthorDate: Fri Dec 3 16:17:41 2021 +0530

    [SYSTEMDS-3235] Scikit-learn to SystemDS dml converter
    
      * Basic sklearn pipelines to dml script mapping
      * List of supported algorithms DBSCANMapper,
        l2svm, Gaussian, GLM, MultiLogReg
      * A distinction between supervised and unsupervised
        algorithms added.
      * Simple testing to verify the transformation, and
        add error handling
      * Includes basic design documentation and decisions.
    
    Co-authored-by: Erwin Tumbul <[email protected]>
    Co-authored-by: Alexander Thien <[email protected]>
    Co-authored-by: Mathias Kahr <[email protected]>
---
 scripts/staging/sklearn/.gitignore                 |   7 ++
 scripts/staging/sklearn/SklearnToDMLMapper.py      | 139 +++++++++++++++++++++
 scripts/staging/sklearn/SklearnToDMLMapper.rst     |  72 +++++++++++
 scripts/staging/sklearn/mapped_functions.rst       |  46 +++++++
 scripts/staging/sklearn/mappers/__init__.py        |  25 ++++
 scripts/staging/sklearn/mappers/mapper.py          |  63 ++++++++++
 scripts/staging/sklearn/mappers/supervised.py      |  88 +++++++++++++
 scripts/staging/sklearn/mappers/transformations.py |  81 ++++++++++++
 scripts/staging/sklearn/mappers/unsupervised.py    |  83 ++++++++++++
 scripts/staging/sklearn/poc/design.rst             |  65 ++++++++++
 scripts/staging/sklearn/poc/poc.py                 | 103 +++++++++++++++
 scripts/staging/sklearn/run_tests.py               |  99 +++++++++++++++
 scripts/staging/sklearn/tests/input_X.csv          |  10 ++
 scripts/staging/sklearn/tests/input_X.csv.mtd      |   1 +
 scripts/staging/sklearn/tests/input_Y.csv          |  10 ++
 scripts/staging/sklearn/tests/input_Y.csv.mtd      |  12 ++
 scripts/staging/sklearn/tests/util.py              | 101 +++++++++++++++
 17 files changed, 1005 insertions(+)

diff --git a/scripts/staging/sklearn/.gitignore 
b/scripts/staging/sklearn/.gitignore
new file mode 100644
index 0000000..2508296
--- /dev/null
+++ b/scripts/staging/sklearn/.gitignore
@@ -0,0 +1,7 @@
+*.pkl
+*.dml
+*.csv
+*.csv.mtd
+
+!/tests/input_X.*
+!/tests/input_Y.*
\ No newline at end of file
diff --git a/scripts/staging/sklearn/SklearnToDMLMapper.py 
b/scripts/staging/sklearn/SklearnToDMLMapper.py
new file mode 100755
index 0000000..703153d
--- /dev/null
+++ b/scripts/staging/sklearn/SklearnToDMLMapper.py
@@ -0,0 +1,139 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pickle
+import sys
+import inspect
+import mappers
+import argparse
+
+class SklearnToDMLMapper:
+    """ SklearnToDMLMapper is a simple tool for transforming scikit-learn 
pipelines into DML scripts.
+        This tool may be used over a simple command line interface, where a 
scikit-learn pipeline provided over 
+        a pickle file. Alternatively, SklearnToDMLMapper can be used in a 
script as a Python module.
+
+        Args:
+            pipeline (sklearn.pipeline.Pipeline): sklearn pipeline
+            input_name (str, optional): Name for the input variable (prefix). 
Defaults to 'input'. 
+                                        Depending on the pipeline two files 
are necessary.
+                                        Example: input_name="input". Maps to 
files input_X.csv and input_Y.csv 
+                                        for a pipeline ending in a supervised 
algorithm.
+    """
+    def __init__(self, pipeline, input_name='input'):
+        """Create an SklearnToDMLMapper."""
+        self.steps = pipeline.steps
+        self.functions = self.__get_functions()
+        self.dml_script = None
+        self.input_name = input_name
+
+    def __get_functions(self):
+        clsmembers = inspect.getmembers(sys.modules['mappers'], 
inspect.isclass)
+        functions = {}
+        for cls in clsmembers:
+            instance = cls[1]()
+            if instance.sklearn_name is not None:
+                functions[instance.sklearn_name] = cls[1]
+
+        return functions
+
+    def __get_input(self):
+        # Get last function (an algorithm)
+        func = self.functions[self.steps[-1][0]]()
+        if func is None:
+            raise RuntimeError(f'{self.steps[-1][0]} is not supported.')
+
+        if func.is_supervised:
+            return f'X = read(${self.input_name}_X)\nY = 
read(${self.input_name}_Y)'
+        else:
+            return f'X = read(${self.input_name}_X)'
+
+    def __get_output(self):
+        func = self.functions[self.steps[-1][0]]()
+        if func is None:
+            raise RuntimeError(f'{self.steps[-1][0]} is not supported.')
+        return '\n'.join([f'write({output}, "{output}.csv")' for output in 
func.mapped_output])
+
+    def transform(self):
+        """Transforms a sklearn pipeline in a .dml script. 
+
+        Returns:
+            str: The transformed .dml script.
+        """
+        sources = []
+        calls = []
+
+        for name, step in self.steps:
+            if name not in self.functions:
+                continue
+
+            mapper = self.functions[name](step.get_params())
+            calls.append(mapper.get_call())
+            sources.append(mapper.get_source())
+
+        self.dml_script = "{}\n\n{}\n\n{}\n\n{}".format('\n'.join(sources), 
+                                                        self.__get_input(), 
+                                                        '\n'.join(calls), 
+                                                        self.__get_output())
+        return self.dml_script
+
+    def save(self, path):
+        """Saves the transformed .dml script.
+
+        Args:
+            path (str): Location where the DML script is to be saved.
+
+        Raises:
+            RuntimeError: Save can only be called if a transformation was 
executed beforehand.
+        """
+        if self.dml_script is None:
+            raise RuntimeError('Transformation was not applied yet.')
+
+        with open(path, 'w') as f:
+            f.write(self.dml_script)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Tool that parses a sklearn 
pipeline and produces a dml script')
+    parser.add_argument('Path',
+            metavar='path',
+            type=str,
+            help='Location of the sklearn pipeline saved as pickle file')
+    parser.add_argument('-i',
+            metavar='input_name',
+            type=str,
+            default='X',
+            help='Name for the input variable (prefix). Depending on the 
pipeline two files are necessary. Example: input_name="input". Maps to files 
input_X.csv and input_Y.csv for a pipeline ending in a supervised algorithm.')
+    parser.add_argument('-o',
+            metavar='output',
+            type=str,
+            default='./pipeline.dml',
+            help='Path for the dml output script')
+
+    args = parser.parse_args()
+
+    try:
+        with open(args['path'], 'rb') as f:
+            pipeline = pickle.load(f)
+
+        mapper = SklearnToDMLMapper(pipeline, args['input_name'])
+        mapper.transform()
+        mapper.save(args['output'])
+    except Exception as e:
+        print(f'Failed to transform pipeline.\nError:\n{e}')
\ No newline at end of file
diff --git a/scripts/staging/sklearn/SklearnToDMLMapper.rst 
b/scripts/staging/sklearn/SklearnToDMLMapper.rst
new file mode 100644
index 0000000..d80251b
--- /dev/null
+++ b/scripts/staging/sklearn/SklearnToDMLMapper.rst
@@ -0,0 +1,72 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+SklearnToDMLMapper
+==================
+
+SklearnToDMLMapper is a simple tool for transforming scikit-learn pipelines 
into DML scripts.
+This tool may be used via a simple command line interface, where a 
scikit-learn pipeline is provided as a `pickle 
<https://docs.python.org/3/library/pickle.html>`_ file. Alternatively, 
SklearnToDMLMapper can be used in a script as a Python module.
+
+
+Prerequisites
+-------------
+
+If a pickle file is provided, no dependencies are necessary except for Python 
3.6+.
+Otherwise, scikit-learn needs to be `installed 
<https://scikit-learn.org/stable/install.html>`_.
+
+Usage
+-----
+
+For usage via the CLI, an example call may look as follows::
+
+    python SklearnToDMLMapper.py -i input -o output_path pipe.pkl
+
+* input: name (prefix) of the input file(s) (see below)
+* output_path: transformed pipeline as .dml script
+* pipe.pkl: binary file (pickle) of a sklearn pipeline
+
+Used as a Python module a script may look as follows::
+
+    from sklearn.pipeline import make_pipeline
+    # Other imports from sklearn
+    from SklearnToDMLMapper import SklearnToDMLMapper
+
+    pipeline = make_pipeline(...)
+
+    mapper = SklearnToDMLMapper(pipeline, 'input')
+    mapper.transform()
+    mapper.save('mapped_pipeline')
+
+or, alternatively using a pickle file::
+
+    from SklearnToDMLMapper import SklearnToDMLMapper
+
+    with open('pipeline.pkl', 'rb') as f:
+        pipeline = pickle.load(f)
+
+    mapper = SklearnToDMLMapper(pipeline, 'input')
+    mapper.transform()
+    mapper.save('mapped_pipeline')
+
+API description
+---------------
+
+.. autoclass:: SklearnToDMLMapper
\ No newline at end of file
diff --git a/scripts/staging/sklearn/mapped_functions.rst 
b/scripts/staging/sklearn/mapped_functions.rst
new file mode 100644
index 0000000..4db4ada
--- /dev/null
+++ b/scripts/staging/sklearn/mapped_functions.rst
@@ -0,0 +1,46 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+Mapped functions
+================
+
+Classification
+--------------
+
+Supervised
+""""""""""
+* glm.dml <=> `TweedieRegressor 
<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html#sklearn.linear_model.TweedieRegressor>`_
+* l2svm.dml <=> `LinearSVC 
<https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC>`_
+* multiLogReg.dml <=> `LogisticRegression 
<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`_
+
+Unsupervised
+""""""""""""
+* dbscan.dml <=> `DBSCAN 
<https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN>`_
+* kmeans.dml <=> `KMeans 
<https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans>`_
+* gmm.dml <=> `GaussianMixture 
<https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture>`_
+
+Transformations
+---------------
+* scale.dml <=> `StandardScaler 
<https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler>`_
+* normalize.dml <=> `Normalizer 
<https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer>`_
+* pca.dml <=> `PCA 
<https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA>`_
+* imputeByMean.dml/imputeByMedian.dml <=> `SimpleImputer 
<https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer>`_
+
diff --git a/scripts/staging/sklearn/mappers/__init__.py 
b/scripts/staging/sklearn/mappers/__init__.py
new file mode 100644
index 0000000..3dfe2e8
--- /dev/null
+++ b/scripts/staging/sklearn/mappers/__init__.py
@@ -0,0 +1,25 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+from .mapper import *
+from .supervised import *
+from .unsupervised import *
+from .transformations import *
diff --git a/scripts/staging/sklearn/mappers/mapper.py 
b/scripts/staging/sklearn/mappers/mapper.py
new file mode 100644
index 0000000..6904d10
--- /dev/null
+++ b/scripts/staging/sklearn/mappers/mapper.py
@@ -0,0 +1,63 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import os
+
+builtin_path = "scripts/builtin"
+
+def scripts_home():
+    systemds_home = os.getenv('SYSTEMDS_HOME')
+    if systemds_home is None:
+        return builtin_path
+    else:
+        return f'{systemds_home}/{builtin_path}'
+
+class Mapper:
+    name = None
+    sklearn_name = None
+
+    mapped_params = []
+    mapped_output = []
+
+    is_intermediate = False
+    is_supervised = False
+
+    def __init__(self, params=None):
+        self.params = params
+        if params is not None:
+            self.map_params()
+
+    def get_source(self):
+        return 'source("{}/{}.dml") as ns_{}'.format(scripts_home(),
+                                                 self.name,
+                                                 self.name)
+
+    def get_call(self):
+        input_ = ['X', 'Y'] if self.is_supervised else ['X']
+        input_ += self.mapped_params
+        output_ = ', '.join(self.mapped_output) if not self.is_intermediate 
else 'X'
+        param_ = ', '.join(map(str, input_))
+        call = "[{}] = ns_{}::m_{}({})".format(
+            output_, self.name, self.name, param_)
+        return call
+
+    def map_params(self):
+        pass
diff --git a/scripts/staging/sklearn/mappers/supervised.py 
b/scripts/staging/sklearn/mappers/supervised.py
new file mode 100644
index 0000000..396d4f6
--- /dev/null
+++ b/scripts/staging/sklearn/mappers/supervised.py
@@ -0,0 +1,88 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+from .mapper import Mapper
+
+class LinearSVMMapper(Mapper):
+    name = 'l2svm'
+    sklearn_name = 'linearsvc'
+    is_supervised = True
+    mapped_output = [
+        'model'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            'TRUE' if self.params.get('fit_intercept', False) else 'FALSE',
+            self.params.get('tol', 0.001),
+            self.params.get('C', 1.0),
+            self.params.get('max_iter', 100),
+            20, # maxii parameter is unkown in sklearn and not documented in 
dml
+            'TRUE' if self.params.get('verbose', False) else 'FALSE',
+            -1  # column_id is unkown in sklearn
+        ]
+
+class TweedieRegressorMapper(Mapper):
+    name = 'glm'
+    sklearn_name = 'tweedieregressor'
+    is_supervised = True
+    mapped_output = [
+        'beta'
+    ]
+
+    def map_params(self):
+        # TODO: many parameters cannot be mapped directly:
+        # how to handle defaults for dml?
+        self.mapped_params = [
+            1,  # sklearn impl supports power only, dfam
+            self.params.get('power', 0.0),  # vpow
+            0,  # link
+            1.0,  # lpow
+            0.0,  # yneg
+            # sklearn does not know last case
+            0 if self.params.get('fit_intercept', 1) else 1, # icpt
+            0.0,  # disp
+            0.0,  # reg
+            self.params.get('tol', 0.000001), # tol
+            200,  # moi
+            0,  # mii,
+            'TRUE' if self.params.get('verbose', False) else 'FALSE'
+        ]
+
+
+class LogisticRegressionMapper(Mapper):
+    name = 'multiLogReg'
+    sklearn_name = 'logisticregression'
+    is_supervised = True
+    mapped_output = [
+        'beta'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            # sklearn does not know last case
+            0 if self.params.get('fit_intercept', 1) else 1,
+            self.params.get('tol', 0.000001), # tol
+            self.params.get('C', 0.0), # reg
+            100,  # maxi
+            0,  # maxii
+            'TRUE' if self.params.get('verbose', False) else 'FALSE'
+        ]
diff --git a/scripts/staging/sklearn/mappers/transformations.py 
b/scripts/staging/sklearn/mappers/transformations.py
new file mode 100644
index 0000000..ed4a578
--- /dev/null
+++ b/scripts/staging/sklearn/mappers/transformations.py
@@ -0,0 +1,81 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+from .mapper import Mapper
+
+class StandardScalerMapper(Mapper):
+    name = 'scale'
+    sklearn_name = 'standardscaler'
+    is_intermediate = True
+    mapped_output = [
+        'Y'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            'TRUE' if self.params.get('with_mean', True) else 'FALSE',
+            'TRUE' if self.params.get('with_std', True) else 'FALSE'
+        ]
+
+class NormalizeMapper(Mapper):
+    name = 'normalize'
+    sklearn_name = 'normalizer'
+    is_intermediate = True
+    mapped_output = [
+        'Y'
+    ]
+
+    def map_params(self):
+        self.mapped_params = []
+
+
+class SimpleImputerMapper(Mapper):
+    name = 'impute'
+    sklearn_name = 'simpleimputer'
+    is_intermediate = True
+    mapped_output = [
+        'X'
+    ]
+
+    def map_params(self):  # might update naming ?
+        if self.params['strategy'] == 'median':
+            self.name = 'imputeByMedian'
+        else:
+            self.name = 'imputeByMean'
+
+        self.mapped_params = []
+
+
+class PCAMapper(Mapper):
+    name = 'pca'
+    sklearn_name = 'pca'
+    is_intermediate = True
+    mapped_output = [
+        'Xout',
+        'Mout'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            2 if self.params['n_components'] is None else 
self.params['random_state'],
+            'TRUE',  # non existant in SKlearn
+            'TRUE'  # non existant in SKlearn
+        ]
diff --git a/scripts/staging/sklearn/mappers/unsupervised.py 
b/scripts/staging/sklearn/mappers/unsupervised.py
new file mode 100644
index 0000000..5191f87
--- /dev/null
+++ b/scripts/staging/sklearn/mappers/unsupervised.py
@@ -0,0 +1,83 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+from .mapper import Mapper
+
+class KmeansMapper(Mapper):
+    name = 'kmeans'
+    sklearn_name = 'kmeans'
+    mapped_output = [
+        'C',  # The output matrix with the centroids
+        'Y'  # The mapping of records to centroids
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            self.params['n_clusters'],
+            self.params['n_init'],
+            self.params['max_iter'],
+            self.params['tol'],
+            'TRUE' if self.params.get('verbose', False) else 'FALSE',
+            50,  # avg_sample_size_per_centroid unkown in sklearn
+            -1 if self.params['random_state'] is None \
+            else self.params['random_state']
+        ]
+
+
+class DBSCANMapper(Mapper):
+    name = 'dbscan'
+    sklearn_name = 'dbscan'
+    mapped_output = [
+        'clusterMembers'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            self.params.get('eps', 0.5),
+            self.params.get('min_samples', 5)
+        ]
+
+
+class GaussianMixtureMapper(Mapper):
+    name = 'gmm'
+    sklearn_name = 'gaussianmixture'
+    model_map = {
+        'full': 'VVV',
+        'tied': 'EEE',
+        'diag': 'VVI',
+        'spherical': 'VVI'
+    }
+    mapped_output = [
+        'weight',
+        'labels',
+        'df',
+        'bic'
+    ]
+
+    def map_params(self):
+        self.mapped_params = [
+            self.params.get('n_components', 3),
+            f'"{self.model_map.get(self.params.get("covariance_type", 
"VVV"))}"',
+            self.params.get('init_params', '"kmeans"'),
+            self.params.get('max_iter', 100),
+            self.params.get('reg_covar', 1e-6),
+            self.params.get('tol', 0.000001)
+        ]
diff --git a/scripts/staging/sklearn/poc/design.rst 
b/scripts/staging/sklearn/poc/design.rst
new file mode 100644
index 0000000..b07e748
--- /dev/null
+++ b/scripts/staging/sklearn/poc/design.rst
@@ -0,0 +1,65 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+Scikit-learn - Importer
+=======================
+Scikit-learn_ is a very popular and well established open-source python 
library for data science applications. A large number of common algorithms and 
many useful tools are implemented and maintained. 
+
+Idea
+----
+Allowing the import of sklearn models allows for an easy extension of already 
established implementations with systemds.
+
+Current State
+^^^^^^^^^^^^^
+Currently an ONNX_ importer is in staging, which is partly broken or rather 
breaks something else in the stable branch.
+
+This tool allows the conversion of ONNX graphs to dml.
+
+The Importer
+------------
+We have the following idea. Since the ONNX importer is (to some degree) working, 
we suggest the modification/extension of the importer to work with sklearn 
models.
+
+When reading from a saved model we expect for either approach a pickle_ 
serialized Python object, since scikit-learn uses Python's built-in 
`persistence model`_. joblib_ is a pickle replacement, which works more 
efficiently on large/complex objects, which is the case with some scikit-learn 
models. In both cases there are some security and maintainability concerns_ to 
be kept in mind.
+
+Proposal 1
+^^^^^^^^^^
+One possible approach to this problem is a direct mapping from scikit-learn to 
DML. But the effort for this approach may be out of scope for this pull request 
(for now). 
+
+Proposal 2
+^^^^^^^^^^
+An easier approach would involve an indirect mapping to ONNX and then to DML:
+
+sklearn --> onnx && onnx --> dml ==> sklearn --> dml
+
+Sklearn models may be converted to ONNX using the sklearn-onnx_ converter, 
part of the official ONNX project. The conversion from ONNX to DML can be 
accomplished using the existing ONNX-Importer of systemds.
+
+This approach requires fixing the onnx importer and the inclusion of a further 
dependency. sklearn-onnx_ is published under an MIT license and requires a few 
other dependencies_.
+
+
+.. _Scikit-learn: https://scikit-learn.org/stable/index.html
+.. _sklearn-onnx: https://github.com/onnx/sklearn-onnx
+.. _dependencies: 
https://github.com/onnx/sklearn-onnx/blob/master/requirements.txt
+.. _ONNX: http://onnx.ai/sklearn-onnx/
+.. _sklearn_related: 
https://scikit-learn.org/stable/related_projects.html#related-projects
+.. _`persistence model`: 
https://scikit-learn.org/stable/modules/model_persistence.html
+.. _pickle: https://docs.python.org/3/library/pickle.html
+.. _joblib: https://joblib.readthedocs.io/en/latest/persistence.html
+.. _concerns: 
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
diff --git a/scripts/staging/sklearn/poc/poc.py 
b/scripts/staging/sklearn/poc/poc.py
new file mode 100755
index 0000000..dd26660
--- /dev/null
+++ b/scripts/staging/sklearn/poc/poc.py
@@ -0,0 +1,103 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import pickle
+
+def dump():
+    '''
+    Builds a sample scikit-learn pipeline (StandardScaler followed by
+    KMeans), prints it and pickles it to 'pipe.pkl' in the current
+    working directory.
+    '''
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.cluster import KMeans
+    from sklearn.pipeline import make_pipeline
+
+    pipeline = make_pipeline(StandardScaler(), KMeans())
+
+    print('Sklearn pipeline:')
+    print(pipeline)
+
+    with open('pipe.pkl', 'wb') as f:
+        pickle.dump(pipeline, f)
+
+    # blank line separates this output from the DML script printed later
+    print()
+dump()
+
+# source scripts from /scripts/builtin/
+# call functions: 
https://apache.github.io/systemds/site/dml-language-reference.html#user-defined-function-udf
+
+def map_lm(sklearn_func):
+    '''
+    Maps a scikit-learn step registered as "linearregression" to a call of
+    the DML builtin lm.
+
+    TODO: the parameters of sklearn_func are not mapped yet, so only a
+    placeholder call without arguments is produced.
+
+    return: the DML call as a string
+    '''
+    # return the call string; without the return the caller formatted
+    # None into the generated script
+    return 'm_lm()'
+
+def map_kmeans(sklearn_func):
+    '''
+    Maps a scikit-learn KMeans step to a call of the DML builtin kmeans:
+
+    m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10,
+    Integer max_iter = 1000, Double eps = 0.000001,
+    Boolean is_verbose = FALSE, Integer avg_sample_size_per_centroid = 50,
+    Integer seed = -1)
+    return (Matrix[Double] C, Matrix[Double] Y)
+
+    The sklearn parameters n_clusters, n_init, max_iter and tol are passed
+    positionally as k, runs, max_iter and eps.
+
+    return: the DML call as a string
+    '''
+    params = sklearn_func.get_params()
+    # one placeholder per argument; with only three placeholders tol was
+    # silently dropped by str.format
+    return 'm_kmeans(X, {}, {}, {}, {})'.format(
+        params['n_clusters'], params['n_init'],
+        params['max_iter'], params['tol'])
+
+def map_scale(sklearn_func):
+    '''
+    Maps a scikit-learn step registered as "standardscaler" to a call of
+    the DML builtin scale.
+
+    with_mean and with_std are passed positionally; Python booleans are
+    written as the DML literals TRUE and FALSE.
+
+    return: the DML call as a string
+    '''
+    params = sklearn_func.get_params()
+    # TODO: handle default params as in dml definition
+    # type mapping: str(True) is 'True', which is not a DML boolean literal
+    center = 'TRUE' if params['with_mean'] else 'FALSE'
+    scale = 'TRUE' if params['with_std'] else 'FALSE'
+    return 'm_scale(X, {}, {})'.format(center, scale)
+    
+# keyed by the step name found in the pickled pipeline;
+# value: (dml builtin name, function that builds the dml call)
+algorithms = {
+    "linearregression": ("lm", map_lm),
+    "standardscaler": ("scale", map_scale),
+    "kmeans": ("kmeans", map_kmeans)
+}
+
+# use setwd for this?
+builtin_path = "scripts/builtin"
+
+sources = []
+
+dml_pipeline = []
+
+# use jinja templating for this?
+# source directory?
+# create separate source file which contains supported algorithms
+# and combine into common namespace?
+
+# validate contents of pipeline:
+# intermediate steps need to be transformative
+# and the last step fits an estimator
+# see https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+
+with open('pipe.pkl', 'rb') as f:
+    loaded = pickle.load(f)
+
+# every pipeline step yields one source statement and one dml call
+for i, step in enumerate(loaded.steps):
+    sklearn_name, algorithm = step
+    name, mapping = algorithms[sklearn_name]
+    call = mapping(algorithm)
+    sources.append(
+        'source("{}/{}") as ns_{}'.format(builtin_path, name, name))
+    # step_i will be needed in following steps
+    dml_pipeline.append('step_{} = ns_{}::{}'.format(i, name, call))
+
+# source statements first, then an empty line, then the calls
+dml_script = '\n\n'.join(['\n'.join(sources), '\n'.join(dml_pipeline)])
+
+print('DML Script')
+print(dml_script)
diff --git a/scripts/staging/sklearn/run_tests.py 
b/scripts/staging/sklearn/run_tests.py
new file mode 100755
index 0000000..9aaac30
--- /dev/null
+++ b/scripts/staging/sklearn/run_tests.py
@@ -0,0 +1,99 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import os
+import subprocess
+import argparse
+import logging
+
+from sklearn.preprocessing import StandardScaler, Normalizer
+from sklearn.impute import SimpleImputer
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.pipeline import make_pipeline
+from sklearn.svm import LinearSVC
+from sklearn.linear_model import TweedieRegressor, LogisticRegression
+from sklearn.mixture import GaussianMixture
+
+from SklearnToDMLMapper import SklearnToDMLMapper
+from tests.util import test_script, compare_script, get_systemds_root
+
+def test_valid(name, pipeline):
+    """
+    Translates pipeline to DML, saves the script as '<name>_gen.dml' and
+    runs it on systemds.
+
+    return: the result of test_script for the generated script
+    """
+    script_path = f'{name}_gen.dml'
+    converter = SklearnToDMLMapper(pipeline)
+    converter.transform()
+    converter.save(script_path)
+    return test_script(script_path)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--log', action='store', default='ERROR',
+                        help='Set logging level (ERROR, INFO, DEBUG).')
+
+    options = parser.parse_args()
+    # translate the level name (e.g. "debug") into the logging constant
+    numeric_level = getattr(logging, options.log.upper(), None)
+    if not isinstance(numeric_level, int):
+        raise ValueError(f'Invalid log level: {options.log}')
+    logging.basicConfig(level=numeric_level)
+
+    # stop early if the systemds root cannot be determined
+    try:
+        get_systemds_root()
+    except Exception as e:
+        logging.error(e)
+        # sys.exit instead of exit(): the exit() helper is only injected
+        # by the site module and is meant for interactive use
+        sys.exit(-1)
+
+    valid_pipelines = [
+        make_pipeline(StandardScaler(), KMeans()),
+        make_pipeline(Normalizer(), KMeans()),
+        make_pipeline(SimpleImputer(strategy='mean'), KMeans()),
+        make_pipeline(SimpleImputer(strategy='median'), KMeans()),
+        make_pipeline(Normalizer(), LinearSVC()),
+        make_pipeline(Normalizer(), TweedieRegressor()),
+        make_pipeline(StandardScaler(), LogisticRegression()),
+        make_pipeline(Normalizer(), LogisticRegression()),
+        # TODO: Tests which use PCA or DBSCAN, trigger a
+        # NullPointerException during parsing for some reason
+        make_pipeline(StandardScaler(), DBSCAN()),
+        make_pipeline(Normalizer(), DBSCAN()),
+        make_pipeline(SimpleImputer(strategy='mean'), DBSCAN()),
+        make_pipeline(SimpleImputer(strategy='median'), DBSCAN()),
+        make_pipeline(PCA(), KMeans()),
+        make_pipeline(PCA(), DBSCAN()),
+        # TODO: GaussianMixtureModel results in LanguageException --
+        # ERROR: [line 0:0] -- Function get_sample_maps() is undefined.
+        make_pipeline(StandardScaler(), GaussianMixture()),
+        make_pipeline(Normalizer(), GaussianMixture())
+    ]
+
+    valid_results = []
+    valid_tests_names = []
+    for i, pipeline in enumerate(valid_pipelines):
+        # test name: running index plus the names of all pipeline steps
+        name = f'test_{i}_' + '_'.join([s[0] for s in pipeline.steps])
+        logging.info('*' * 50)
+        logging.info((18*'*' + name + (50-20-len(name)) * '*'))
+        result = test_valid(name, pipeline)
+        valid_results.append(result)
+        valid_tests_names.append(name)
+
+    # summary: one line per test
+    print('*' * 50)
+    print('Finished all tests.')
+    for (name, r) in zip(valid_tests_names, valid_results):
+        print('{}: {}'.format(name, 'Failed' if not r else 'Success'))
diff --git a/scripts/staging/sklearn/tests/input_X.csv 
b/scripts/staging/sklearn/tests/input_X.csv
new file mode 100644
index 0000000..f6e6b31
--- /dev/null
+++ b/scripts/staging/sklearn/tests/input_X.csv
@@ -0,0 +1,10 @@
+-7.237310391208174210e+00,-9.031086522545416884e+00
+-8.165501360870660363e+00,-7.008504394784431213e+00
+-7.022668436942145931e+00,-7.570412890908222892e+00
+-8.863943061317664629e+00,-5.053239814677235486e+00
+8.525185826796044530e-02,3.645282967948058506e+00
+-7.941522766238410247e-01,2.104951171962878842e+00
+-1.340520809891420972e+00,4.157119493365751595e+00
+-1.032012970766660942e+01,-4.337402902031620044e+00
+-2.187731658211975017e+00,3.333521246686991013e+00
+-8.535604566608126831e+00,-6.013489256860858667e+00
diff --git a/scripts/staging/sklearn/tests/input_X.csv.mtd 
b/scripts/staging/sklearn/tests/input_X.csv.mtd
new file mode 100644
index 0000000..1acb285
--- /dev/null
+++ b/scripts/staging/sklearn/tests/input_X.csv.mtd
@@ -0,0 +1 @@
+{"rows": 10, "cols": 2, "format": "csv"}
diff --git a/scripts/staging/sklearn/tests/input_Y.csv 
b/scripts/staging/sklearn/tests/input_Y.csv
new file mode 100644
index 0000000..2b81a96
--- /dev/null
+++ b/scripts/staging/sklearn/tests/input_Y.csv
@@ -0,0 +1,10 @@
+3.0
+3.0
+3.0
+2.0
+1.0
+1.0
+1.0
+2.0
+1.0
+2.0
diff --git a/scripts/staging/sklearn/tests/input_Y.csv.mtd 
b/scripts/staging/sklearn/tests/input_Y.csv.mtd
new file mode 100644
index 0000000..4b8fdf2
--- /dev/null
+++ b/scripts/staging/sklearn/tests/input_Y.csv.mtd
@@ -0,0 +1,12 @@
+{
+    "data_type": "matrix",
+    "value_type": "double",
+    "rows": 10,
+    "cols": 1,
+    "nnz": 10,
+    "format": "csv",
+    "author": "mathias",
+    "header": false,
+    "sep": ",",
+    "created": "2021-02-16 17:11:40 CET"
+}
\ No newline at end of file
diff --git a/scripts/staging/sklearn/tests/util.py 
b/scripts/staging/sklearn/tests/util.py
new file mode 100755
index 0000000..8d80c4a
--- /dev/null
+++ b/scripts/staging/sklearn/tests/util.py
@@ -0,0 +1,101 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import os
+import subprocess
+import difflib
+import logging
+
+def get_systemds_root():
+    """
+    Returns the value of the SYSTEMDS_ROOT environment variable.
+
+    raise: KeyError with an explanatory message if SYSTEMDS_ROOT is not set
+    """
+    try:
+        root = os.environ['SYSTEMDS_ROOT']
+    except KeyError as error:
+        raise KeyError(f"SYSTEMDS_ROOT is not set.\nError\n{error}")
+    return root
+        
+def get_sklearn_root():
+    """Returns the scripts/staging/sklearn directory below the systemds root."""
+    systemds_root = get_systemds_root()
+    return systemds_root + '/scripts/staging/sklearn'
+
+def invoke_systemds(path):
+    """
+    Runs the DML script at path with <SYSTEMDS_ROOT>/bin/systemds, passing
+    tests/input_X.csv and tests/input_Y.csv as the named arguments input_X
+    and input_Y.
+
+    A run counts as failed if systemds exits with a non-zero code, writes
+    anything to stderr, prints "error" (any case) on stdout or exceeds the
+    timeout.
+
+    return: True if the script ran without errors, False otherwise
+    raise: KeyError if SYSTEMDS_ROOT is not set
+    """
+    root = get_systemds_root()
+
+    try:
+        script_path = os.path.relpath(path, os.getcwd())
+        # every command line argument is its own list element; a single
+        # string containing blanks is handed over as one argument
+        result = subprocess.run([root + "/bin/systemds", script_path,
+                                 '-nvargs',
+                                 'input_X=tests/input_X.csv',
+                                 'input_Y=tests/input_Y.csv'],
+                                check=True,
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                                timeout=10000)
+
+        stdout_text = result.stdout.decode('utf-8')
+        stderr_text = result.stderr.decode('utf-8')
+        logging.debug('*' * 100)
+        logging.debug('\n' + stdout_text)
+        logging.debug('\n' + stderr_text)
+        logging.debug('*' * 100)
+
+        # It looks like python does not notice systemds errors
+        # Is 0 returned in error cases?
+        # Check if there is any error and raise manually.
+        if len(result.stderr) != 0 or 'error' in stdout_text.lower():
+            raise subprocess.CalledProcessError(returncode=result.returncode,
+                                                cmd=result.args,
+                                                stderr=result.stderr,
+                                                output=result.stdout)
+
+    except subprocess.TimeoutExpired as timeout_error:
+        # subprocess.run kills the child itself; report the test as failed
+        # instead of aborting the whole test run
+        logging.error("Failed to run systemds!")
+        logging.error("Timed out after " + str(timeout_error.timeout) + " seconds.")
+        return False
+    except subprocess.CalledProcessError as systemds_error:
+        logging.error("Failed to run systemds!")
+        logging.error("Error code: " + str(systemds_error.returncode))
+        logging.error("Stdout:")
+        logging.error(systemds_error.output.decode("utf-8"))
+        logging.error("Stderr:")
+        logging.error(systemds_error.stderr.decode("utf-8"))
+        return False
+    logging.info("Successfully executed script.")
+    return True
+
+def test_script(path):
+    """
+    Runs the generated script at path on systemds and logs start and end.
+
+    return: the result of invoke_systemds for the script
+    """
+    logging.info('#' * 30)
+    logging.info('Running generated script on systemds.')
+    succeeded = invoke_systemds(path)
+    logging.info('Finished test.')
+    return succeeded
+
+# Compares two scripts using difflib.ndiff
+def compare_script(actual, expected):
+    """
+    Compares a generated script with its expected counterpart line by line.
+
+    actual: path of the generated script, relative to the sklearn root
+    expected: file name of the expected script below tests/expected
+
+    return: True if both scripts are identical, False if they differ or
+            if the comparison itself failed (the error is logged)
+    """
+    try:
+        expected_path = f'{get_sklearn_root()}/tests/expected/{expected}'
+        actual_path = f'{get_sklearn_root()}/{actual}'
+        # context managers close both files even if reading fails
+        with open(expected_path) as f_expected, open(actual_path) as f_actual:
+            # expected first: ndiff marks lines unique to its second
+            # argument with "+ ", which is what the legend below states
+            # for the actual script
+            diff = difflib.ndiff(f_expected.readlines(), f_actual.readlines())
+            changes = [l.strip() for l in diff if not l.startswith('  ')]
+        logging.info('#' * 30)
+        if len(changes) == 0:
+            logging.info('Actual script matches expected script.')
+            return True
+        else:
+            logging.info('Actual script does not match expected script.')
+            logging.info('Legend:')
+            logging.info('    "+ " ... line unique to actual script')
+            logging.info('    "- " ... line unique to expected script')
+            logging.info('    "? " ... linue not present in either script')
+            logging.info('#' * 30)
+            logging.info('\n' + '\n'.join(changes))
+            logging.info('#' * 30)
+            return False
+    except Exception as e:
+        logging.error('Failed to compare script.')
+        logging.error(e)
+        return False
\ No newline at end of file

[systemds] 01/03: [SYSTEMDS-3235] Scikit-learn to SystemDS dml converter

Reply via email to