This is an automated email from the ASF dual-hosted git repository.

cutlerb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5bf5d9d  [SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark
5bf5d9d is described below

commit 5bf5d9d854db53541956dedb03e2de8eecf65b81
Author: Andrew-Crosby <[email protected]>
AuthorDate: Tue Apr 23 13:53:33 2019 -0700

    [SPARK-26970][PYTHON][ML] Add Spark ML interaction transformer to PySpark
    
    ## What changes were proposed in this pull request?
    
    Adds the Spark ML Interaction transformer to PySpark
    
    ## How was this patch tested?
    
    - Added Python doctest
    - Ran the newly added example code
    - Manually confirmed that a PipelineModel that contains an Interaction transformer can now be loaded in PySpark
    
    Closes #24426 from Andrew-Crosby/pyspark-interaction-transformer.
    
    Lead-authored-by: Andrew-Crosby <[email protected]>
    Co-authored-by: Andrew-Crosby <[email protected]>
    Signed-off-by: Bryan Cutler <[email protected]>
---
 examples/src/main/python/ml/interaction_example.py | 56 ++++++++++++++++++++++
 python/pyspark/ml/feature.py                       | 54 +++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/examples/src/main/python/ml/interaction_example.py 
b/examples/src/main/python/ml/interaction_example.py
new file mode 100644
index 0000000..4b63227
--- /dev/null
+++ b/examples/src/main/python/ml/interaction_example.py
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
from __future__ import print_function

# $example on$
from pyspark.ml.feature import Interaction, VectorAssembler
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Build (or reuse) a SparkSession for this example.
    spark = (SparkSession
             .builder
             .appName("InteractionExample")
             .getOrCreate())

    # $example on$
    # A small integer dataset: an id column followed by six feature columns.
    rows = [(1, 1, 2, 3, 8, 4, 5),
            (2, 4, 3, 8, 7, 9, 8),
            (3, 6, 1, 9, 2, 3, 6),
            (4, 10, 8, 6, 9, 4, 5),
            (5, 9, 2, 7, 10, 7, 3),
            (6, 1, 1, 4, 2, 8, 4)]
    df = spark.createDataFrame(rows, ["id1", "id2", "id3", "id4", "id5", "id6", "id7"])

    # Pack id2-id4 into one vector column, id5-id7 into another.
    firstAssembler = VectorAssembler(inputCols=["id2", "id3", "id4"], outputCol="vec1")
    withVec1 = firstAssembler.transform(df)

    secondAssembler = VectorAssembler(inputCols=["id5", "id6", "id7"], outputCol="vec2")
    withVectors = secondAssembler.transform(withVec1).select("id1", "vec1", "vec2")

    # Produce the flattened cross-products of the scalar column and both vectors.
    interaction = Interaction(inputCols=["id1", "vec1", "vec2"], outputCol="interactedCol")
    interacted = interaction.transform(withVectors)

    interacted.show(truncate=False)
    # $example off$

    spark.stop()
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 595ab18..4f5809c 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -39,6 +39,7 @@ __all__ = ['Binarizer',
            'IDF', 'IDFModel',
            'Imputer', 'ImputerModel',
            'IndexToString',
+           'Interaction',
            'MaxAbsScaler', 'MaxAbsScalerModel',
            'MinHashLSH', 'MinHashLSHModel',
            'MinMaxScaler', 'MinMaxScalerModel',
@@ -1228,6 +1229,59 @@ class ImputerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
 
 
 @inherit_doc
+class Interaction(JavaTransformer, HasInputCols, HasOutputCol, JavaMLReadable, 
JavaMLWritable):
+    """
+    Implements the feature interaction transform. This transformer takes in 
Double and Vector type
+    columns and outputs a flattened vector of their feature interactions. To 
handle interaction,
+    we first one-hot encode any nominal features. Then, a vector of the 
feature cross-products is
+    produced.
+
+    For example, given the input feature values `Double(2)` and `Vector(3, 
4)`, the output would be
+    `Vector(6, 8)` if all input features were numeric. If the first feature 
was instead nominal
+    with four categories, the output would then be `Vector(0, 0, 0, 0, 3, 4, 
0, 0)`.
+
+    >>> df = spark.createDataFrame([(0.0, 1.0), (2.0, 3.0)], ["a", "b"])
+    >>> interaction = Interaction(inputCols=["a", "b"], outputCol="ab")
+    >>> interaction.transform(df).show()
+    +---+---+-----+
+    |  a|  b|   ab|
+    +---+---+-----+
+    |0.0|1.0|[0.0]|
+    |2.0|3.0|[6.0]|
+    +---+---+-----+
+    ...
+    >>> interactionPath = temp_path + "/interaction"
+    >>> interaction.save(interactionPath)
+    >>> loadedInteraction = Interaction.load(interactionPath)
+    >>> loadedInteraction.transform(df).head().ab == 
interaction.transform(df).head().ab
+    True
+
+    .. versionadded:: 3.0.0
+    """
+
+    @keyword_only
+    def __init__(self, inputCols=None, outputCol=None):
+        """
+        __init__(self, inputCols=None, outputCol=None):
+        """
+        super(Interaction, self).__init__()
+        self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.Interaction", self.uid)
+        self._setDefault()
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("3.0.0")
+    def setParams(self, inputCols=None, outputCol=None):
+        """
+        setParams(self, inputCols=None, outputCol=None)
+        Sets params for this Interaction.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+
+@inherit_doc
 class MaxAbsScaler(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, 
JavaMLWritable):
     """
     Rescale each feature individually to range [-1, 1] by dividing through the 
largest maximum


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to